WIP: MVP
This commit is contained in:
@ -5,4 +5,6 @@ add_executable(main
|
||||
bitstream.h
|
||||
huffman_table.cpp
|
||||
huffman_table.h
|
||||
MashZip.cpp
|
||||
MashZip.h
|
||||
main.cpp)
|
10
MashZip.h
10
MashZip.h
@ -1,4 +1,14 @@
|
||||
#include "huffman_table.h"
|
||||
|
||||
#include <istream>
|
||||
#include <fstream>
|
||||
|
||||
#define MAGIC "MASH"
|
||||
|
||||
// Entry points of the "MASH" Huffman compressor.
//
// Only the declarations live here; the definitions are in MashZip.cpp.
// The class carries no state of its own.
class MashZip {
public:
    // Nothing to initialise, so let the compiler generate the constructor.
    MashZip() = default;

    // Reads from `is` and writes to `os`.
    // NOTE(review): the caller in main.cpp feeds the output of mashzip_file()
    // in as `is` and prints `os` as the restored text, so this presumably
    // decompresses -- confirm against MashZip.cpp.
    void unmashzip_stream(std::basic_istream<char> &is, std::basic_ostream<char> &os);

    // Reads from `cs` and `is` and writes to `os`.
    // NOTE(review): main.cpp passes two streams holding the same text as `cs`
    // and `is`; presumably `cs` is consumed to build the Huffman table and
    // `is` is the pass that gets encoded -- confirm against MashZip.cpp.
    void mashzip_file(std::basic_istream<char> &cs, std::basic_istream<char> &is, std::basic_ostream<char> &os);
};
|
@ -6,11 +6,16 @@
|
||||
|
||||
// i.e. when the stream is 0b12345678 0bABCDEFGH and the command is
|
||||
// to read 12 bits:
|
||||
// out = 00000000 00000000 0000EFGH 12345678
|
||||
// out = 00000000 00000000 0000EFGH 12345678 -- wrong
|
||||
// to read 2 bits:
|
||||
// out = 00000000 00000000 00000000 00000078
|
||||
// TODO:
|
||||
// to read 12 bits:
|
||||
// out = 00000000 00000000 00008765 4321HGFE
|
||||
// to read 2 bits:
|
||||
// out = 00000000 00000000 00000000 00000087
|
||||
int ibitstream::getbits(size_t n) {
|
||||
std::cerr << "To read " << n << " count: " << this->count << " " << this->cache << std::endl;
|
||||
// std::cerr << "To read " << n << " count: " << this->count << " " << this->cache << std::endl;
|
||||
int out = 0, read = 0, to_read;
|
||||
|
||||
if (n > 32 || n < 0) {
|
||||
@ -25,16 +30,29 @@ int ibitstream::getbits(size_t n) {
|
||||
|
||||
while (n > 0) {
|
||||
to_read = min(n, 8 - this->count);
|
||||
std::cerr << "Iter n: " << n << " count: " << this->count << " "
|
||||
<< this->cache << " to_read: " << to_read << " already read: " << read << std::endl;
|
||||
// std::cerr << "Iter n: " << n << " count: " << this->count << " "
|
||||
// << this->cache << " to_read: " << to_read << " already read: " << read << std::endl;
|
||||
|
||||
// cache & 0b11111000 if count = 3;
|
||||
// cache & 0b10000000 if count = 7;
|
||||
// cache & 0b11111111 if count = 0, etc;
|
||||
uint8_t mask = (((1 << to_read) - 1) << this->count);
|
||||
out |= ((cache & mask) >> this->count) << read;
|
||||
uint8_t chunk = ((cache & mask) >> this->count);
|
||||
|
||||
std::cerr << "Read result: " << std::bitset<8>((cache & mask) >> this->count) << " " << std::bitset<32>(((cache & mask) >> this->count) << read) << std::endl;
|
||||
// todo inverse chunk
|
||||
uint8_t inv = 0;
|
||||
for (size_t i = 0; i < to_read; i++)
|
||||
{
|
||||
inv |= ((chunk >> i) & 1) << (to_read - i - 1);
|
||||
}
|
||||
|
||||
out <<= to_read; // shift by length of chunk
|
||||
out |= inv; // concat with chunk
|
||||
|
||||
// out |= inv << read;
|
||||
|
||||
// std::cerr << "Mask " << std::bitset<8>(mask) << " chunk " << std::bitset<8>(chunk) <<
|
||||
// " inv " << std::bitset<8>(inv) << " out " << std::bitset<32>(out) << std::endl;
|
||||
|
||||
this->count += to_read;
|
||||
read += to_read;
|
||||
@ -51,13 +69,33 @@ int ibitstream::getbits(size_t n) {
|
||||
}
|
||||
|
||||
void obitstream::writebits(short bits, size_t n) {
|
||||
std::cerr << "Write: " << std::bitset<16>(bits) << " n: " << n << " count: " << this->count << " cache: " << std::bitset<8>(this->cache) << std::endl;
|
||||
// std::cerr << "Write: " << std::bitset<16>(bits) << " n: " << n << " count: " << this->count << " cache: " << std::bitset<8>(this->cache) << std::endl;
|
||||
int written = 0, to_write;
|
||||
while (n > 0) {
|
||||
to_write = min(n, 8 - this->count);
|
||||
|
||||
// mask:
|
||||
// ------- n -----------------------------
|
||||
// 00000..00000 1...1111111111 000000..000 == bits
|
||||
// - written - | - to_write - | - offset -
|
||||
// so offset = n - to_write
|
||||
// because n = n - to_write at every step, so written is already present
|
||||
|
||||
uint8_t chunk = (bits & (((1 << to_write) - 1) << written)) >> written;
|
||||
this->cache |= (chunk << this->count);
|
||||
uint16_t mask = (((1 << to_write) - 1) << (n - to_write));
|
||||
uint8_t chunk = (bits & mask) >> (n - to_write);
|
||||
|
||||
// todo inverse bits in chunk
|
||||
uint8_t inv = 0;
|
||||
for (size_t i = 0; i < to_write; i++)
|
||||
{
|
||||
inv |= ((chunk >> i) & 1) << (to_write - i - 1);
|
||||
}
|
||||
|
||||
// std::cerr << "Chunk " << std::bitset<8>(chunk) << " inv " << std::bitset<8>(inv) << " to_write "
|
||||
// << to_write << " written " << written << " n " << n << " mask " << std::bitset<16>(mask)
|
||||
// << " offset " << (n - to_write) << std::endl;
|
||||
|
||||
this->cache |= (inv << this->count);
|
||||
|
||||
this->count += to_write;
|
||||
written += to_write;
|
||||
@ -65,7 +103,7 @@ void obitstream::writebits(short bits, size_t n) {
|
||||
|
||||
if (this->count == 8){
|
||||
// flush chunk
|
||||
std::cerr << "Flush: " << std::bitset<8>(this->cache) << std::endl;
|
||||
// std::cerr << "Flush: " << std::bitset<8>(this->cache) << std::endl;
|
||||
|
||||
os << this->cache;
|
||||
os.flush();
|
||||
@ -78,7 +116,7 @@ void obitstream::writebits(short bits, size_t n) {
|
||||
}
|
||||
|
||||
void obitstream::flush() {
|
||||
std::cerr << "Flush: " << std::bitset<8>(this->cache) << std::endl;
|
||||
// std::cerr << "Flush: " << std::bitset<8>(this->cache) << std::endl;
|
||||
os << this->cache;
|
||||
os.flush();
|
||||
|
||||
|
@ -6,22 +6,32 @@
|
||||
#include <queue>
|
||||
#include <iostream>
|
||||
|
||||
#define HEADER_SIZE 128
|
||||
|
||||
void initialize_table(const std::map<int, std::set<char> > &huffmanLengths,
|
||||
std::unordered_map<char, std::pair<int, short> > &table)
|
||||
void initialize_table(const int sumLen,
|
||||
const std::map<int, std::set<char> > &huffmanLengths,
|
||||
std::unordered_map<char, std::pair<int, short> > &codingTable,
|
||||
std::vector<char> &symbols,
|
||||
std::array<int, MAX_LEN> &counts)
|
||||
{
|
||||
int nextbl = 0;
|
||||
int nextbl = 0, offset = 0; // offset = total offset to symbols of current len
|
||||
short code = 0;
|
||||
|
||||
symbols.resize(sumLen);
|
||||
counts.fill(0);
|
||||
|
||||
// std::cerr << "Sum len " << sumLen << std::endl;
|
||||
|
||||
for (auto lenCodePairIt = huffmanLengths.begin(); lenCodePairIt != huffmanLengths.end(); lenCodePairIt++)
|
||||
{
|
||||
int cnt = 0; // counter of symbols of current code length
|
||||
auto lenCodePair = *lenCodePairIt;
|
||||
|
||||
counts[lenCodePair.first] = lenCodePair.second.size();
|
||||
// std::cerr << "Counts[" << lenCodePair.first << "] " << counts[lenCodePair.first] << std::endl;
|
||||
|
||||
for (auto it = lenCodePair.second.begin(); it != lenCodePair.second.end(); it++)
|
||||
{
|
||||
table[*it].first = lenCodePair.first; // save current bit length for code
|
||||
table[*it].second = code;
|
||||
codingTable[*it].first = lenCodePair.first; // save current bit length for code
|
||||
codingTable[*it].second = code;
|
||||
|
||||
// code := (code + 1) << ((bit length of the next symbol) − (current bit length))
|
||||
// code++;
|
||||
@ -33,15 +43,20 @@ void initialize_table(const std::map<int, std::set<char> > &huffmanLengths,
|
||||
nextbl = lenCodePair.first;
|
||||
}
|
||||
|
||||
// std::cerr << "symbols[" << offset + cnt << "] =" << (*it) << " code " << std::bitset<16>(code) << std::endl;
|
||||
symbols[offset + cnt] = *it;
|
||||
|
||||
code = (code + 1) << (nextbl - lenCodePair.first);
|
||||
cnt++;
|
||||
}
|
||||
|
||||
offset += cnt;
|
||||
// code <<= 1;
|
||||
}
|
||||
}
|
||||
|
||||
HuffmanTable::HuffmanTable(uint8_t *header) {
|
||||
int cnt1, cnt2;
|
||||
HuffmanTable::HuffmanTable(const char *header) {
|
||||
int cnt1, cnt2, total_cnt = 0;
|
||||
std::map<int, std::set<char> > huffmanLengths;
|
||||
|
||||
for (int i = 0; i < HEADER_SIZE; i++) {
|
||||
@ -49,13 +64,15 @@ HuffmanTable::HuffmanTable(uint8_t *header) {
|
||||
cnt2 = (header[i] & 0b1111);
|
||||
if (cnt1 != 0) huffmanLengths[cnt1].insert((char)(i * 2));
|
||||
if (cnt2 != 0) huffmanLengths[cnt2].insert((char)(i * 2 + 1));
|
||||
|
||||
total_cnt += cnt1 + cnt2;
|
||||
}
|
||||
|
||||
// build up codes
|
||||
initialize_table(huffmanLengths, this->huffmanCodes);
|
||||
initialize_table(total_cnt, huffmanLengths, this->huffmanCodes, this->symbols, this->counts);
|
||||
}
|
||||
|
||||
void get_lengths(Node* root, int len,
|
||||
void get_lengths(Node* root, int len, int &cnt,
|
||||
std::map<int, std::set<char> > &huffmanLengths)
|
||||
{
|
||||
if (!root)
|
||||
@ -65,11 +82,12 @@ void get_lengths(Node* root, int len,
|
||||
if (root->isLeaf()) {
|
||||
// huffmanCode[root->ch] = str;
|
||||
// std::cerr << "Got leaf: " << root->getChar() << std::endl;
|
||||
cnt++;
|
||||
huffmanLengths[len].insert(root->getChar());
|
||||
}
|
||||
|
||||
get_lengths(root->getLeft(), len + 1, huffmanLengths);
|
||||
get_lengths(root->getRight(), len + 1, huffmanLengths);
|
||||
get_lengths(root->getLeft(), len + 1, cnt, huffmanLengths);
|
||||
get_lengths(root->getRight(), len + 1, cnt, huffmanLengths);
|
||||
}
|
||||
|
||||
HuffmanTable::HuffmanTable(std::basic_istream<char> &is) {
|
||||
@ -81,7 +99,7 @@ HuffmanTable::HuffmanTable(std::basic_istream<char> &is) {
|
||||
freq[ch]++;
|
||||
}
|
||||
|
||||
std::cerr << "Calculated freqs" << std::endl;
|
||||
// std::cerr << "Calculated freqs" << std::endl;
|
||||
|
||||
// Create a priority queue to store live nodes of
|
||||
// Huffman tree;
|
||||
@ -94,7 +112,7 @@ HuffmanTable::HuffmanTable(std::basic_istream<char> &is) {
|
||||
pq.push(new_node);
|
||||
}
|
||||
|
||||
std::cerr << "Filled PQ: " << pq.size() << std::endl;
|
||||
// std::cerr << "Filled PQ: " << pq.size() << std::endl;
|
||||
|
||||
// do till there is more than one node in the queue
|
||||
while (pq.size() != 1)
|
||||
@ -119,11 +137,20 @@ HuffmanTable::HuffmanTable(std::basic_istream<char> &is) {
|
||||
Node* root = pq.top();
|
||||
|
||||
std::map<int, std::set<char> > huffmanLengths;
|
||||
get_lengths(root, 0, huffmanLengths);
|
||||
int total_cnt = 0;
|
||||
get_lengths(root, 0, total_cnt, huffmanLengths);
|
||||
|
||||
// std::cerr << "Got lengths: " << huffmanLengths.size() << std::endl;
|
||||
|
||||
initialize_table(huffmanLengths, this->huffmanCodes);
|
||||
initialize_table(total_cnt, huffmanLengths, this->huffmanCodes, this->symbols, this->counts);
|
||||
|
||||
// for (auto s : this->symbols) {
|
||||
// std::cerr << "Symbol " << s << std::endl;
|
||||
// }
|
||||
|
||||
// for (int i = 0; i < this->counts.size(); i++) {
|
||||
// std::cerr << "Count for len " << i << " " << this->counts[i] << std::endl;
|
||||
// }
|
||||
}
|
||||
|
||||
std::pair<int, short> HuffmanTable::operator[](const char &c) {
|
||||
@ -133,11 +160,45 @@ std::pair<int, short> HuffmanTable::operator[](const char &c) {
|
||||
// Writes the Huffman code assigned to `c` into `os`.
//
// Throws std::runtime_error when `c` has no entry in `huffmanCodes`.
// Each call also logs the symbol, its code and its code length to std::cerr.
void HuffmanTable::write_symbol(obitstream &os, const char &c) {
    // One lookup serves the presence check, the log line and the write.
    auto entry = huffmanCodes.find(c);
    if (entry == huffmanCodes.end()) {
        throw std::runtime_error("No code in table for char!");
    }

    const int code_len = entry->second.first;     // number of bits in the code
    const short code_bits = entry->second.second; // the code itself

    std::cerr << "Write code for " << c << " " << (int)c << " : " << std::bitset<16>(code_bits) << " " << " len " << code_len << std::endl;

    os.writebits(code_bits, code_len);
}
|
||||
|
||||
uint8_t *HuffmanTable::to_header() {
|
||||
uint8_t *header = new uint8_t[HEADER_SIZE];
|
||||
// Decodes one symbol from `bs` using the canonical-Huffman tables built by
// initialize_table(): `counts[len]` is the number of codes of length `len`,
// and `symbols` lists the symbols ordered by code length.
//
// Bits are pulled one at a time and appended to `code`, first bit read being
// the most significant. After each bit the code is compared against the
// range of codes of the current length.
//
// Returns the decoded symbol (a `char` widened to int), or -1 when no code
// of any representable length matches.
//
// NOTE(review): `symbols` holds plain `char`; where `char` is signed, bytes
// >= 0x80 are returned as negative values and 0xFF is indistinguishable from
// the -1 failure value -- confirm how callers test for failure.
int HuffmanTable::decode_one_symbol(ibitstream &bs)
{
    uint16_t code = 0; // bits accumulated so far
    int first = 0;     // first canonical code of the current length
    int index = 0;     // offset in `symbols` of the first symbol of this length

    // `counts` is indexed by code length and only has entries 0..size()-1.
    // The previous bound (`len <= MAX_LEN`) read one element past the end of
    // the array on the final iteration.
    const int max_len = static_cast<int>(this->counts.size());

    for (int len = 1; len < max_len; len++) {
        // read one bit
        code |= static_cast<uint16_t>(bs.getbits(1));

        const int count = this->counts[len];

        // Codes of this length occupy [first, first + count).
        if (code < first + count) {
            return this->symbols[index + (code - first)];
        }

        // No match: skip this length's symbols and make room for the next bit.
        index += count;
        first += count;
        first <<= 1;
        code <<= 1;
    }

    return -1;
}
|
||||
|
||||
char *HuffmanTable::to_header() {
|
||||
char *header = new char[HEADER_SIZE];
|
||||
|
||||
for (size_t i = 0; i < HEADER_SIZE; i++)
|
||||
{
|
||||
|
@ -3,6 +3,9 @@
|
||||
|
||||
#include "bitstream.h"
|
||||
|
||||
#define HEADER_SIZE 128
|
||||
#define MAX_LEN 16
|
||||
|
||||
#ifndef HUFFMAN_TABLE
|
||||
#define HUFFMAN_TABLE
|
||||
|
||||
@ -10,16 +13,18 @@ class HuffmanTable
|
||||
{
|
||||
private:
|
||||
std::unordered_map<char, std::pair<int, short> > huffmanCodes;
|
||||
std::vector<char> symbols;
|
||||
std::array<int, 16> counts;
|
||||
public:
|
||||
// Given the list of code lengths length[0..n-1] representing a canonical
|
||||
// Huffman code for n symbols, construct the tables required to decode those
|
||||
// codes.
|
||||
HuffmanTable(uint8_t *header);
|
||||
HuffmanTable(const char *header);
|
||||
|
||||
// Build from input stream
|
||||
HuffmanTable(std::basic_istream<char> &is);
|
||||
|
||||
uint8_t *to_header();
|
||||
char *to_header();
|
||||
|
||||
std::pair<int, short> operator[](const char &c);
|
||||
|
||||
|
78
main.cpp
78
main.cpp
@ -9,6 +9,7 @@
|
||||
|
||||
#include "bitstream.h"
|
||||
#include "huffman_table.h"
|
||||
#include "MashZip.h"
|
||||
|
||||
using namespace std;
|
||||
|
||||
@ -201,18 +202,18 @@ int main(int argc, char **argv)
|
||||
// // 10111010 11110101
|
||||
// // ^^
|
||||
// int a = ibs.getbits(2);
|
||||
// cout << bitset<2>(a) << endl;
|
||||
// cout << bitset<2>(a) << endl; // 01
|
||||
|
||||
// // 10111010 11110101
|
||||
// // ^^^^
|
||||
// int b = ibs.getbits(4);
|
||||
// cout << bitset<4>(b) << endl;
|
||||
// cout << bitset<4>(b) << endl; // 0111
|
||||
|
||||
|
||||
// // 10111010 11110101
|
||||
// // ^^ ^^^^^^^
|
||||
// b = ibs.getbits(9);
|
||||
// cout << bitset<9>(b) << endl;
|
||||
// cout << bitset<9>(b) << endl; // 011010111
|
||||
|
||||
// // 10111010 11110101
|
||||
// // ^ + overflow
|
||||
@ -230,11 +231,13 @@ int main(int argc, char **argv)
|
||||
// cout << "After 3 bits: " << so.str() << endl;
|
||||
// // cache here: 00000111
|
||||
|
||||
// obs.writebits(0xf5, 7);
|
||||
// // here: Flush: 10101111
|
||||
|
||||
// // de = 11011110
|
||||
// obs.writebits(0xde, 7);
|
||||
// // here: Flush: 11101111
|
||||
// cout << "After 7 bits: " << so.str() << endl;
|
||||
// obs.flush();
|
||||
// // here: Flush: 00000011
|
||||
// // here: Flush: 00000001
|
||||
// cout << "After flush: " << so.str() << endl;
|
||||
|
||||
|
||||
@ -245,34 +248,59 @@ int main(int argc, char **argv)
|
||||
// cout << "After flush: " << so.str() << endl;
|
||||
|
||||
|
||||
string s = "Some long text!!!!\x01\x02\x03\x04";
|
||||
// string s = ;
|
||||
|
||||
stringstream ss1(s);
|
||||
stringstream ss1("Some long text!!!!\x01\x02\x03\x04\n123123456789 privet, masha!");
|
||||
stringstream ss2("Some long text!!!!\x01\x02\x03\x04\n123123456789 privet, masha!");
|
||||
stringstream so1;
|
||||
stringstream so2;
|
||||
|
||||
HuffmanTable ht(ss1);
|
||||
MashZip mz;
|
||||
|
||||
uint8_t *header = ht.to_header();
|
||||
// for (size_t i = 0; i < 128; i++)
|
||||
// {
|
||||
// cout << "Code for " << 2 * i << " and " << 2 * i + 1 << ": " << bitset<8>(header[i]) << endl;
|
||||
// }
|
||||
mz.mashzip_file(ss1, ss2, so1);
|
||||
|
||||
std::cout << "After mashzip: " << so1.str();
|
||||
|
||||
mz.unmashzip_stream(so1, so2);
|
||||
|
||||
std::cout << "After unmashzip: " << so2.str();
|
||||
|
||||
// HuffmanTable ht(ss1);
|
||||
|
||||
// char *header = ht.to_header();
|
||||
// // for (size_t i = 0; i < 128; i++)
|
||||
// // {
|
||||
// // cout << "Code for " << 2 * i << " and " << 2 * i + 1 << ": " << bitset<8>(header[i]) << endl;
|
||||
// // }
|
||||
|
||||
ostringstream test;
|
||||
// stringstream test;
|
||||
|
||||
test << "MASH"; // magic
|
||||
// // test << "MASH"; // magic
|
||||
|
||||
for (size_t i = 0; i < 128; i++) {
|
||||
test << header[i];
|
||||
}
|
||||
// // for (size_t i = 0; i < 128; i++) {
|
||||
// // test << header[i];
|
||||
// // }
|
||||
|
||||
obitstream some_stream(test);
|
||||
// obitstream some_stream(test);
|
||||
|
||||
for (char c : s) {
|
||||
ht.write_symbol(some_stream, c);
|
||||
}
|
||||
some_stream.flush();
|
||||
// for (char c : s) {
|
||||
// ht.write_symbol(some_stream, c);
|
||||
// }
|
||||
// some_stream.flush();
|
||||
|
||||
std::cout << test.str();
|
||||
// std::cout << test.str() << endl;
|
||||
|
||||
// ibitstream some_instream(test);
|
||||
|
||||
// while(true) {
|
||||
// char sym = (char) ht.decode_one_symbol(some_instream);
|
||||
// if (sym < 0)
|
||||
// {
|
||||
// std::cout << "Read all" << std::endl;
|
||||
// break;
|
||||
// }
|
||||
// std::cerr << "Sym: " << sym << std::endl;
|
||||
// }
|
||||
|
||||
return 0;
|
||||
}
|
Reference in New Issue
Block a user