This commit is contained in:
Andrey Gumirov
2024-01-11 02:44:53 +07:00
parent 8780822f95
commit b6a619303d
6 changed files with 201 additions and 57 deletions

View File

@ -5,4 +5,6 @@ add_executable(main
bitstream.h
huffman_table.cpp
huffman_table.h
MashZip.cpp
MashZip.h
main.cpp)

View File

@ -1,4 +1,14 @@
#include "huffman_table.h"
#include <istream>
#include <fstream>
#define MAGIC "MASH"
/// Front end of the "MASH" compressor: one call to compress, one to decompress.
///
/// The class has no data members; both operations work only on the streams
/// passed to them. The member functions are declared here and not defined in
/// this header.
class MashZip {
public:
    /// Nothing to initialise, so the compiler-generated constructor is used.
    MashZip() = default;

    /// Reads compressed data from `is` and writes the restored bytes to `os`.
    /// NOTE(review): presumably expects the input to start with MAGIC followed
    /// by the code-length header - confirm against the implementation.
    void unmashzip_stream(std::basic_istream<char> &is, std::basic_ostream<char> &os);

    /// Compresses the contents of `is` and writes the result to `os`.
    /// NOTE(review): main() passes two streams with identical content as `cs`
    /// and `is`, so `cs` is presumably consumed first to build the Huffman
    /// table for the data that is then read from `is` - confirm against the
    /// implementation.
    void mashzip_file(std::basic_istream<char> &cs, std::basic_istream<char> &is, std::basic_ostream<char> &os);
};

View File

@ -6,11 +6,16 @@
// i.e. when the stream is 0b12345678 0bABCDEFGH and the command is
// to read 12 bits:
// out = 00000000 00000000 0000EFGH 12345678
// out = 00000000 00000000 0000EFGH 12345678 -- wrong
// to read 2 bits:
// out = 00000000 00000000 00000000 00000078
// TODO:
// to read 12 bits:
// out = 00000000 00000000 00008765 4321HGFE
// to read 2 bits:
// out = 00000000 00000000 00000000 00000087
int ibitstream::getbits(size_t n) {
std::cerr << "To read " << n << " count: " << this->count << " " << this->cache << std::endl;
// std::cerr << "To read " << n << " count: " << this->count << " " << this->cache << std::endl;
int out = 0, read = 0, to_read;
if (n > 32 || n < 0) {
@ -25,16 +30,29 @@ int ibitstream::getbits(size_t n) {
while (n > 0) {
to_read = min(n, 8 - this->count);
std::cerr << "Iter n: " << n << " count: " << this->count << " "
<< this->cache << " to_read: " << to_read << " already read: " << read << std::endl;
// std::cerr << "Iter n: " << n << " count: " << this->count << " "
// << this->cache << " to_read: " << to_read << " already read: " << read << std::endl;
// cache & 0b11111000 if count = 3;
// cache & 0b10000000 if count = 7;
// cache & 0b11111111 if count = 0, etc;
uint8_t mask = (((1 << to_read) - 1) << this->count);
out |= ((cache & mask) >> this->count) << read;
uint8_t chunk = ((cache & mask) >> this->count);
std::cerr << "Read result: " << std::bitset<8>((cache & mask) >> this->count) << " " << std::bitset<32>(((cache & mask) >> this->count) << read) << std::endl;
// todo inverse chunk
uint8_t inv = 0;
for (size_t i = 0; i < to_read; i++)
{
inv |= ((chunk >> i) & 1) << (to_read - i - 1);
}
out <<= to_read; // shift by length of chunk
out |= inv; // concat with chunk
// out |= inv << read;
// std::cerr << "Mask " << std::bitset<8>(mask) << " chunk " << std::bitset<8>(chunk) <<
// " inv " << std::bitset<8>(inv) << " out " << std::bitset<32>(out) << std::endl;
this->count += to_read;
read += to_read;
@ -51,13 +69,33 @@ int ibitstream::getbits(size_t n) {
}
void obitstream::writebits(short bits, size_t n) {
std::cerr << "Write: " << std::bitset<16>(bits) << " n: " << n << " count: " << this->count << " cache: " << std::bitset<8>(this->cache) << std::endl;
// std::cerr << "Write: " << std::bitset<16>(bits) << " n: " << n << " count: " << this->count << " cache: " << std::bitset<8>(this->cache) << std::endl;
int written = 0, to_write;
while (n > 0) {
to_write = min(n, 8 - this->count);
// mask:
// ------- n -----------------------------
// 00000..00000 1...1111111111 000000..000 == bits
// - written - | - to_write - | - offset -
// so offset = n - to_write
// because n = n - to_write at every step, so written is already present
uint8_t chunk = (bits & (((1 << to_write) - 1) << written)) >> written;
this->cache |= (chunk << this->count);
uint16_t mask = (((1 << to_write) - 1) << (n - to_write));
uint8_t chunk = (bits & mask) >> (n - to_write);
// todo inverse bits in chunk
uint8_t inv = 0;
for (size_t i = 0; i < to_write; i++)
{
inv |= ((chunk >> i) & 1) << (to_write - i - 1);
}
// std::cerr << "Chunk " << std::bitset<8>(chunk) << " inv " << std::bitset<8>(inv) << " to_write "
// << to_write << " written " << written << " n " << n << " mask " << std::bitset<16>(mask)
// << " offset " << (n - to_write) << std::endl;
this->cache |= (inv << this->count);
this->count += to_write;
written += to_write;
@ -65,7 +103,7 @@ void obitstream::writebits(short bits, size_t n) {
if (this->count == 8){
// flush chunk
std::cerr << "Flush: " << std::bitset<8>(this->cache) << std::endl;
// std::cerr << "Flush: " << std::bitset<8>(this->cache) << std::endl;
os << this->cache;
os.flush();
@ -78,7 +116,7 @@ void obitstream::writebits(short bits, size_t n) {
}
void obitstream::flush() {
std::cerr << "Flush: " << std::bitset<8>(this->cache) << std::endl;
// std::cerr << "Flush: " << std::bitset<8>(this->cache) << std::endl;
os << this->cache;
os.flush();

View File

@ -6,22 +6,32 @@
#include <queue>
#include <iostream>
#define HEADER_SIZE 128
void initialize_table(const std::map<int, std::set<char> > &huffmanLengths,
std::unordered_map<char, std::pair<int, short> > &table)
void initialize_table(const int sumLen,
const std::map<int, std::set<char> > &huffmanLengths,
std::unordered_map<char, std::pair<int, short> > &codingTable,
std::vector<char> &symbols,
std::array<int, MAX_LEN> &counts)
{
int nextbl = 0;
int nextbl = 0, offset = 0; // offset = total offset to symbols of current len
short code = 0;
symbols.resize(sumLen);
counts.fill(0);
// std::cerr << "Sum len " << sumLen << std::endl;
for (auto lenCodePairIt = huffmanLengths.begin(); lenCodePairIt != huffmanLengths.end(); lenCodePairIt++)
{
int cnt = 0; // counter of symbols of current code length
auto lenCodePair = *lenCodePairIt;
counts[lenCodePair.first] = lenCodePair.second.size();
// std::cerr << "Counts[" << lenCodePair.first << "] " << counts[lenCodePair.first] << std::endl;
for (auto it = lenCodePair.second.begin(); it != lenCodePair.second.end(); it++)
{
table[*it].first = lenCodePair.first; // save current bit length for code
table[*it].second = code;
codingTable[*it].first = lenCodePair.first; // save current bit length for code
codingTable[*it].second = code;
// code := (code + 1) << ((bit length of the next symbol) (current bit length))
// code++;
@ -33,15 +43,20 @@ void initialize_table(const std::map<int, std::set<char> > &huffmanLengths,
nextbl = lenCodePair.first;
}
// std::cerr << "symbols[" << offset + cnt << "] =" << (*it) << " code " << std::bitset<16>(code) << std::endl;
symbols[offset + cnt] = *it;
code = (code + 1) << (nextbl - lenCodePair.first);
cnt++;
}
offset += cnt;
// code <<= 1;
}
}
HuffmanTable::HuffmanTable(uint8_t *header) {
int cnt1, cnt2;
HuffmanTable::HuffmanTable(const char *header) {
int cnt1, cnt2, total_cnt = 0;
std::map<int, std::set<char> > huffmanLengths;
for (int i = 0; i < HEADER_SIZE; i++) {
@ -49,13 +64,15 @@ HuffmanTable::HuffmanTable(uint8_t *header) {
cnt2 = (header[i] & 0b1111);
if (cnt1 != 0) huffmanLengths[cnt1].insert((char)(i * 2));
if (cnt2 != 0) huffmanLengths[cnt2].insert((char)(i * 2 + 1));
total_cnt += cnt1 + cnt2;
}
// build up codes
initialize_table(huffmanLengths, this->huffmanCodes);
initialize_table(total_cnt, huffmanLengths, this->huffmanCodes, this->symbols, this->counts);
}
void get_lengths(Node* root, int len,
void get_lengths(Node* root, int len, int &cnt,
std::map<int, std::set<char> > &huffmanLengths)
{
if (!root)
@ -65,11 +82,12 @@ void get_lengths(Node* root, int len,
if (root->isLeaf()) {
// huffmanCode[root->ch] = str;
// std::cerr << "Got leaf: " << root->getChar() << std::endl;
cnt++;
huffmanLengths[len].insert(root->getChar());
}
get_lengths(root->getLeft(), len + 1, huffmanLengths);
get_lengths(root->getRight(), len + 1, huffmanLengths);
get_lengths(root->getLeft(), len + 1, cnt, huffmanLengths);
get_lengths(root->getRight(), len + 1, cnt, huffmanLengths);
}
HuffmanTable::HuffmanTable(std::basic_istream<char> &is) {
@ -81,7 +99,7 @@ HuffmanTable::HuffmanTable(std::basic_istream<char> &is) {
freq[ch]++;
}
std::cerr << "Calculated freqs" << std::endl;
// std::cerr << "Calculated freqs" << std::endl;
// Create a priority queue to store live nodes of
// Huffman tree;
@ -94,7 +112,7 @@ HuffmanTable::HuffmanTable(std::basic_istream<char> &is) {
pq.push(new_node);
}
std::cerr << "Filled PQ: " << pq.size() << std::endl;
// std::cerr << "Filled PQ: " << pq.size() << std::endl;
// do till there is more than one node in the queue
while (pq.size() != 1)
@ -119,11 +137,20 @@ HuffmanTable::HuffmanTable(std::basic_istream<char> &is) {
Node* root = pq.top();
std::map<int, std::set<char> > huffmanLengths;
get_lengths(root, 0, huffmanLengths);
int total_cnt = 0;
get_lengths(root, 0, total_cnt, huffmanLengths);
// std::cerr << "Got lengths: " << huffmanLengths.size() << std::endl;
initialize_table(huffmanLengths, this->huffmanCodes);
initialize_table(total_cnt, huffmanLengths, this->huffmanCodes, this->symbols, this->counts);
// for (auto s : this->symbols) {
// std::cerr << "Symbol " << s << std::endl;
// }
// for (int i = 0; i < this->counts.size(); i++) {
// std::cerr << "Count for len " << i << " " << this->counts[i] << std::endl;
// }
}
std::pair<int, short> HuffmanTable::operator[](const char &c) {
@ -133,11 +160,45 @@ std::pair<int, short> HuffmanTable::operator[](const char &c) {
// Writes the Huffman code assigned to symbol `c` into the output bit stream.
//   os - bit stream that receives the code through writebits(code, length)
//   c  - symbol to encode; it must have an entry in huffmanCodes
// Throws std::runtime_error when the table holds no code for `c`.
void HuffmanTable::write_symbol(obitstream &os, const char &c) {
// Reject unknown symbols first, so the operator[] lookups below never default-insert an entry.
if (huffmanCodes.find(c) == huffmanCodes.end()) throw std::runtime_error("No code in table for char!");
// Debug trace on stderr: the symbol, its numeric value, its code as 16 bits and the code length.
std::cerr << "Write code for " << c << " " << (int)c << " : " << std::bitset<16>(huffmanCodes[c].second) << " " << " len " << huffmanCodes[c].first << std::endl;
// In each table entry .second is the code and .first is its length in bits.
os.writebits(huffmanCodes[c].second, huffmanCodes[c].first);
}
uint8_t *HuffmanTable::to_header() {
uint8_t *header = new uint8_t[HEADER_SIZE];
// Decodes a single symbol from `bs` using the canonical Huffman tables
// (`counts` = number of codes per code length, `symbols` = symbols ordered by
// code length).
//
// Bits are read one at a time and appended to `code`. For the current length:
//   first - the smallest code of that length
//   index - position in `symbols` of the first symbol with that length
// so a code of the current length is valid exactly when it is below
// first + counts[len], and its symbol is symbols[index + (code - first)].
//
// Returns the decoded symbol (a char widened to int), or -1 when no code of
// any supported length matched.
// NOTE(review): symbols are stored as char; where char is signed, bytes >= 0x80
// are returned as negative values and 0xFF cannot be told apart from the -1
// sentinel - confirm how callers detect the end of the data.
int HuffmanTable::decode_one_symbol(ibitstream &bs)
{
    // counts is indexed by code length, so valid lengths are 1 .. counts.size() - 1.
    // The loop must stop before counts.size(): indexing counts[counts.size()]
    // would read past the end of the array.
    const int len_limit = static_cast<int>(this->counts.size());

    uint16_t code = 0;
    int first = 0;
    int index = 0;

    for (int len = 1; len < len_limit; len++) {
        // Append the next bit of the stream to the code read so far.
        code |= static_cast<uint16_t>(bs.getbits(1));

        const int count = this->counts[len];
        if (code < first + count) {
            return this->symbols[index + (code - first)];
        }

        // No match at this length: skip its symbols and move on to codes
        // that are one bit longer.
        index += count;
        first += count;
        first <<= 1;
        code <<= 1;
    }

    return -1;
}
char *HuffmanTable::to_header() {
char *header = new char[HEADER_SIZE];
for (size_t i = 0; i < HEADER_SIZE; i++)
{

View File

@ -3,6 +3,9 @@
#include "bitstream.h"
#define HEADER_SIZE 128
#define MAX_LEN 16
#ifndef HUFFMAN_TABLE
#define HUFFMAN_TABLE
@ -10,16 +13,18 @@ class HuffmanTable
{
private:
std::unordered_map<char, std::pair<int, short> > huffmanCodes;
std::vector<char> symbols;
std::array<int, 16> counts;
public:
// Given the list of code lengths length[0..n-1] representing a canonical
// Huffman code for n symbols, construct the tables required to decode those
// codes.
HuffmanTable(uint8_t *header);
HuffmanTable(const char *header);
// Build from input stream
HuffmanTable(std::basic_istream<char> &is);
uint8_t *to_header();
char *to_header();
std::pair<int, short> operator[](const char &c);

View File

@ -9,6 +9,7 @@
#include "bitstream.h"
#include "huffman_table.h"
#include "MashZip.h"
using namespace std;
@ -201,18 +202,18 @@ int main(int argc, char **argv)
// // 10111010 11110101
// // ^^
// int a = ibs.getbits(2);
// cout << bitset<2>(a) << endl;
// cout << bitset<2>(a) << endl; // 01
// // 10111010 11110101
// // ^^^^
// int b = ibs.getbits(4);
// cout << bitset<4>(b) << endl;
// cout << bitset<4>(b) << endl; // 0111
// // 10111010 11110101
// // ^^ ^^^^^^^
// b = ibs.getbits(9);
// cout << bitset<9>(b) << endl;
// cout << bitset<9>(b) << endl; // 011010111
// // 10111010 11110101
// // ^ + overflow
@ -230,11 +231,13 @@ int main(int argc, char **argv)
// cout << "After 3 bits: " << so.str() << endl;
// // cache here: 00000111
// obs.writebits(0xf5, 7);
// // here: Flush: 10101111
// // de = 11011110
// obs.writebits(0xde, 7);
// // here: Flush: 11101111
// cout << "After 7 bits: " << so.str() << endl;
// obs.flush();
// // here: Flush: 00000011
// // here: Flush: 00000001
// cout << "After flush: " << so.str() << endl;
@ -245,34 +248,59 @@ int main(int argc, char **argv)
// cout << "After flush: " << so.str() << endl;
string s = "Some long text!!!!\x01\x02\x03\x04";
// string s = ;
stringstream ss1(s);
stringstream ss1("Some long text!!!!\x01\x02\x03\x04\n123123456789 privet, masha!");
stringstream ss2("Some long text!!!!\x01\x02\x03\x04\n123123456789 privet, masha!");
stringstream so1;
stringstream so2;
HuffmanTable ht(ss1);
MashZip mz;
uint8_t *header = ht.to_header();
// for (size_t i = 0; i < 128; i++)
// {
// cout << "Code for " << 2 * i << " and " << 2 * i + 1 << ": " << bitset<8>(header[i]) << endl;
// }
mz.mashzip_file(ss1, ss2, so1);
std::cout << "After mashzip: " << so1.str();
mz.unmashzip_stream(so1, so2);
std::cout << "After unmashzip: " << so2.str();
// HuffmanTable ht(ss1);
// char *header = ht.to_header();
// // for (size_t i = 0; i < 128; i++)
// // {
// // cout << "Code for " << 2 * i << " and " << 2 * i + 1 << ": " << bitset<8>(header[i]) << endl;
// // }
ostringstream test;
// stringstream test;
test << "MASH"; // magic
// // test << "MASH"; // magic
for (size_t i = 0; i < 128; i++) {
test << header[i];
}
// // for (size_t i = 0; i < 128; i++) {
// // test << header[i];
// // }
obitstream some_stream(test);
// obitstream some_stream(test);
for (char c : s) {
ht.write_symbol(some_stream, c);
}
some_stream.flush();
// for (char c : s) {
// ht.write_symbol(some_stream, c);
// }
// some_stream.flush();
std::cout << test.str();
// std::cout << test.str() << endl;
// ibitstream some_instream(test);
// while(true) {
// char sym = (char) ht.decode_one_symbol(some_instream);
// if (sym < 0)
// {
// std::cout << "Read all" << std::endl;
// break;
// }
// std::cerr << "Sym: " << sym << std::endl;
// }
return 0;
}