chore(CPlusPlus): added Huffman (#468)
Signed-off-by: Kareim Tarek AbdelAzeem <kareimgazar1990@gmail.com> Co-authored-by: Ankur <54987647+Arsenic-ATG@users.noreply.github.com>pull/472/head
parent
2def163eae
commit
b330bdb75d
|
@ -0,0 +1,23 @@
|
||||||
|
# Huffman Code
|
||||||
|
|
||||||
|
## Goal
|
||||||
|
|
||||||
|
The goal is to minimize the size of files by exploiting the fact that some characters occur with a much higher frequency than others (for example, the letters of a very common word like "the"): the most common characters are written with shorter codes and the least common characters with longer codes.
|
||||||
|
|
||||||
|
## Get Started
|
||||||
|
|
||||||
|
In the `main.cpp` file you can find a running example, and the results can be seen in detail, phase by phase, in the `results` folder.
|
||||||
|
|
||||||
|
## The trick
|
||||||
|
|
||||||
|
The difficulty with Huffman codes is that they rely on the concept of **make the common case faster** regarding the frequency of characters in the file, which is a property of the file itself.
|
||||||
|
So, for better performance, the program records the frequency of each character in the file before processing, providing the algorithm with the best prediction.
|
||||||
|
|
||||||
|
## Performance
|
||||||
|
|
||||||
|
This algorithm decreases file size by about half on average.
|
||||||
|
Personally, I used this program to minimize my XML and JSON files.
|
||||||
|
|
||||||
|
## Application
|
||||||
|
|
||||||
|
This algorithm is mainly used to decrease the size of files, from text to audio.
|
|
@ -0,0 +1,284 @@
|
||||||
|
// this will require one to change the source directory in the build system
|
||||||
|
#include "algorithms/CPlusPlus/Trees/huffmanCode/huffman.h"
|
||||||
|
|
||||||
|
#include <iostream>
|
||||||
|
#include <vector>
|
||||||
|
#include <string>
|
||||||
|
#include <fstream>
|
||||||
|
#include <sstream>
|
||||||
|
#include <iterator>
|
||||||
|
#include <algorithm>
|
||||||
|
#include <map>
|
||||||
|
#include <bitset>
|
||||||
|
#include <utility>
|
||||||
|
#include <regex> // NOLINT (build/c++11)
|
||||||
|
|
||||||
|
using namespace std;
|
||||||
|
|
||||||
|
// Global character-frequency table: maps a character to the number of times
// updateMap() has counted it. Nothing in this file clears it, so counts
// accumulate across successive updateMap()/getAlphabet() calls.
map<char, int> frequency;
|
||||||
|
// One node of the Huffman code tree.
// NOTE(review): huffman.h, which this file includes, defines a class with the
// same name and members — confirm the two definitions do not collide.
class Node{
public:
    Node * parent;  // link to the parent node (may be nullptr)
    Node * left;    // child followed for a '0' bit (nullptr on a leaf)
    Node * right;   // child followed for a '1' bit (nullptr on a leaf)
    char key;       // character stored in the node
    int freq;       // occurrence count (sum of both children for merged nodes)
    string name;    // left empty by the constructor; filled in by the tree builder

    // Stores the three links, the key (converted from int to char) and the
    // frequency. `name` is default-constructed.
    Node(Node * p, Node * l, Node * r, int k, int f)
        : parent(p),
          left(l),
          right(r),
          key(static_cast<char>(k)),
          freq(f)
    {
    }
};
|
||||||
|
|
||||||
|
// Walks the tree under `root` (left subtree first, then right) and prints the
// key of every leaf to stdout, each followed by a single space.
// `root` must not be null: it is dereferenced unconditionally.
void postOrder(Node * root)
{
    Node * const lhs = root->left;
    Node * const rhs = root->right;

    if (lhs != nullptr)
    {
        postOrder(lhs);
    }
    if (rhs != nullptr)
    {
        postOrder(rhs);
    }

    // Only nodes without children are printed.
    const bool isLeaf = (lhs == nullptr) && (rhs == nullptr);
    if (isLeaf)
    {
        cout << root->key << " ";
    }
}
|
||||||
|
|
||||||
|
// Strict-weak ordering for (character, frequency) pairs: returns true when
// `a` has a lower frequency than `b`. Characters are not compared, so pairs
// with equal frequency are considered equivalent.
// The arguments are taken by const reference so the comparator can be used
// with const pairs and temporaries and cannot modify the elements it compares.
bool cmp(const pair<char, int>& a,
         const pair<char, int>& b)
{
    return a.second < b.second;
}
|
||||||
|
|
||||||
|
// Ordering predicate for tree nodes: true when `a` has a strictly lower
// frequency than `b`. Both pointers are dereferenced and must be non-null.
bool compare(Node * a, Node * b)
{
    const int lhsFreq = a->freq;
    const int rhsFreq = b->freq;
    return lhsFreq < rhsFreq;
}
|
||||||
|
|
||||||
|
// Function to sort the map according
|
||||||
|
// to value in a (key-value) pairs
|
||||||
|
vector<pair<char, int> > sortChars(map<char, int>& M)
|
||||||
|
{
|
||||||
|
vector<pair<char, int> > result;
|
||||||
|
for (auto& it : M) {
|
||||||
|
result.push_back(it);
|
||||||
|
}
|
||||||
|
sort(result.begin(), result.end(), cmp);
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
void updateMap(string textLine)
|
||||||
|
{
|
||||||
|
for(char c : textLine)
|
||||||
|
{
|
||||||
|
frequency[c] +=1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
vector<pair<char, int> > getAlphabet(string alphabetSource)
|
||||||
|
{
|
||||||
|
string text;
|
||||||
|
ifstream MyReadFile(alphabetSource);
|
||||||
|
while (getline (MyReadFile, text)) {
|
||||||
|
//frequency['\n'] +=1;
|
||||||
|
updateMap(text);
|
||||||
|
}
|
||||||
|
MyReadFile.close();
|
||||||
|
vector<pair<char, int> > results = sortChars(frequency);
|
||||||
|
return results;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Appends to `binaryCode` the path from `root` to the leaf holding `key`:
// '0' for every step to a left child, '1' for every step to a right child.
// At each internal node the left branch is taken when the left child's `name`
// contains `key`; otherwise the right branch is taken.
//
// Returns the leaf that holds `key`, or nullptr when `root` is null or the
// walk ends without finding such a leaf. On failure `binaryCode` is restored
// to the length it had on entry, so no partial path is left behind.
//
// Only leaves are matched against `key`: internal nodes carry a placeholder
// key and must not be mistaken for a symbol.
Node * getBinary(Node * root, char key, string & binaryCode)
{
    const string::size_type originalLength = binaryCode.size();

    Node * current = root;
    while (current != nullptr)
    {
        const bool isLeaf = current->left == nullptr && current->right == nullptr;
        if (isLeaf)
        {
            if (current->key == key)
            {
                return current;
            }
            break;  // reached a leaf for a different character
        }

        Node * const leftChild = current->left;
        if (leftChild != nullptr && leftChild->name.find(key) != string::npos)
        {
            binaryCode += '0';
            current = leftChild;
        }
        else
        {
            binaryCode += '1';
            current = current->right;  // may be null; the loop condition handles it
        }
    }

    // Key not present (or tree malformed): undo anything appended above.
    binaryCode.resize(originalLength);
    return nullptr;
}
|
||||||
|
|
||||||
|
string extract(Node * root, string text)
|
||||||
|
{
|
||||||
|
Node * head = root;
|
||||||
|
string result = "";
|
||||||
|
for(int i=0; i<text.size(); i++)
|
||||||
|
{
|
||||||
|
if(text[i]=='0') head = head->left;
|
||||||
|
else head = head->right;
|
||||||
|
|
||||||
|
if(head->left== nullptr && head->right== nullptr)
|
||||||
|
{
|
||||||
|
result.push_back(head->key);
|
||||||
|
head = root;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
string code(Node * root, string text)
|
||||||
|
{
|
||||||
|
string result = "";
|
||||||
|
|
||||||
|
for(int i=0; i<text.size(); i++)
|
||||||
|
{
|
||||||
|
string binaryCode = "";
|
||||||
|
getBinary(root, text[i], binaryCode);
|
||||||
|
result += binaryCode;
|
||||||
|
}
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Packs a string of '0'/'1' characters into raw `int` values and writes them
// to the file at `binaryResultFilePath` (opened in binary mode, truncating any
// previous content). The text is cut into chunks of 31 bits so that every
// chunk fits in a non-negative int. Each chunk and its integer value are also
// logged to "results/integers.txt" for debugging.
//
// Returns the number of integers written.
//
// Behaviour notes:
//  * Every remaining bit is written; a single trailing bit is no longer
//    dropped.
//  * An empty `binaryText` writes nothing and returns 0 instead of passing an
//    empty string to stoi.
//  * A final chunk shorter than 31 bits is padded on the right with '0' so
//    that a reader expanding each integer back to exactly 31 bits gets the
//    original bits in their original positions. The padding bits come after
//    the real data — a decoder may interpret them as extra trailing symbols.
//  * stoi throws if the text contains anything other than '0' and '1'.
int codedBinay(string binaryText, string binaryResultFilePath)
{
    const string::size_type chunkBits = 31;

    ofstream binaryResultFile(binaryResultFilePath, ios::out | ios::binary);
    ofstream integersFile("results/integers.txt");  // for debugging

    int intNumber = 0;
    for (string::size_type base = 0; base < binaryText.size(); base += chunkBits)
    {
        string target = binaryText.substr(base, chunkBits);
        target.resize(chunkBits, '0');  // only changes the final, short chunk

        const int outInteger = stoi(target, nullptr, 2);
        integersFile << target << '\n' << outInteger << '\n';
        binaryResultFile.write(reinterpret_cast<const char *>(&outInteger), sizeof(int));
        ++intNumber;
    }

    binaryResultFile.close();
    return intNumber;
}
|
||||||
|
|
||||||
|
// Produces a whitespace-reduced copy of a text file.
// Every line of `filePath` has each run of two or more whitespace characters
// replaced by a single space; the processed lines are then joined with no
// separator between them (line breaks are not kept) and written to
// `modifiedSourceFilePath`.
void minifyFile(string filePath, string modifiedSourceFilePath)
{
    const regex whitespaceRun("\\s{2,}");

    string collapsed;
    {
        ifstream input(filePath);
        for (string line; getline(input, line); )
        {
            collapsed.append(regex_replace(line, whitespaceRun, " "));
        }
    }  // input is closed here

    ofstream output(modifiedSourceFilePath);
    output << collapsed;
}
|
||||||
|
|
||||||
|
// Builds the Huffman code tree for the characters found in the file at
// `alphabetSourceFilePath` and returns its root.
//
// One leaf is created per character returned by getAlphabet() (ascending
// frequency). The two lowest-frequency nodes are then merged repeatedly: the
// lowest becomes the right child, the next one the left child, and the new
// parent gets key '\0', the summed frequency and a `name` made of the left
// name followed by the right name. The working list is re-sorted after each
// merge.
//
// Returns nullptr when no characters were collected (for example an empty or
// unreadable file) instead of indexing an empty vector.
// Nodes are allocated with `new`; nothing in this function releases them.
Node * buildCodeTree(string alphabetSourceFilePath)
{
    const vector<pair<char, int> > alphabet = getAlphabet(alphabetSourceFilePath);
    if (alphabet.empty())
    {
        return nullptr;
    }

    vector<Node *> nodes;
    nodes.reserve(alphabet.size());
    for (const pair<char, int>& entry : alphabet)
    {
        Node * leaf = new Node(nullptr, nullptr, nullptr, entry.first, entry.second);
        leaf->name = leaf->key;  // a leaf's name is just its own character
        nodes.push_back(leaf);
    }

    while (nodes.size() > 1)
    {
        Node * right = nodes[0];
        Node * left = nodes[1];
        nodes.erase(nodes.begin(), nodes.begin() + 2);

        Node * parent = new Node(nullptr, left, right, '\0', left->freq + right->freq);
        parent->name = left->name + right->name;
        nodes.push_back(parent);

        sort(nodes.begin(), nodes.end(), compare);
    }
    return nodes.front();
}
|
||||||
|
|
||||||
|
void writeCodedText(string modifiedSourceFilePath, string textCodeFilePath, Node * root)
|
||||||
|
{
|
||||||
|
string sourceFileLine;
|
||||||
|
ifstream sourceFile(modifiedSourceFilePath);
|
||||||
|
ofstream distenationFile(textCodeFilePath);
|
||||||
|
|
||||||
|
while (getline (sourceFile, sourceFileLine)) {
|
||||||
|
// Output the text from the file
|
||||||
|
string textLine = sourceFileLine ; // not adding "\n" leaves only white spaces and results in one string
|
||||||
|
string codedText = code(root, textLine);
|
||||||
|
distenationFile << codedText;
|
||||||
|
}
|
||||||
|
sourceFile.close();
|
||||||
|
distenationFile.close();
|
||||||
|
}
|
||||||
|
|
||||||
|
void extractCodedText(string textCodeFilePath, string decodedFilePath, Node * root)
|
||||||
|
{
|
||||||
|
string textCodeFileLine;
|
||||||
|
ifstream textCodeFile(textCodeFilePath);
|
||||||
|
ofstream decodedFile(decodedFilePath);
|
||||||
|
|
||||||
|
while (getline (textCodeFile, textCodeFileLine)) {
|
||||||
|
string decodedText = extract(root, textCodeFileLine);
|
||||||
|
decodedFile << decodedText; // this i think mostly appends no new lines
|
||||||
|
}
|
||||||
|
textCodeFile.close();
|
||||||
|
decodedFile.close();
|
||||||
|
}
|
||||||
|
|
||||||
|
int codeBinaryFromText(string textCodeFilePath, string binaryResultFilePath)
|
||||||
|
{
|
||||||
|
string sourceTextCodedFileLine;
|
||||||
|
ifstream sourceTextCodedFile(textCodeFilePath);
|
||||||
|
int linesNum = 0;
|
||||||
|
while (getline(sourceTextCodedFile, sourceTextCodedFileLine)) {
|
||||||
|
linesNum = codedBinay(sourceTextCodedFileLine, binaryResultFilePath);
|
||||||
|
}
|
||||||
|
sourceTextCodedFile.close();
|
||||||
|
return linesNum;
|
||||||
|
}
|
||||||
|
|
||||||
|
void decompressBinaryFile(string binarySourceFilePath, string textDistnationFilePath,int linesNum, Node * alphabetRoot)
|
||||||
|
{
|
||||||
|
ifstream binaryResultFile(binarySourceFilePath, ios::out | ios::binary);
|
||||||
|
ofstream decodedbinaryFile(textDistnationFilePath);
|
||||||
|
int codedNumber;
|
||||||
|
string allText = "";
|
||||||
|
for(int i=0; i<linesNum; i++)
|
||||||
|
{
|
||||||
|
binaryResultFile.read((char *) &codedNumber, sizeof(int));
|
||||||
|
string str = bitset<31>(codedNumber).to_string();
|
||||||
|
allText += str;
|
||||||
|
}
|
||||||
|
string decodedText = extract(alphabetRoot, allText);
|
||||||
|
decodedbinaryFile << decodedText; // this i think mostly appends no new lines
|
||||||
|
binaryResultFile.close();
|
||||||
|
decodedbinaryFile.close();
|
||||||
|
}
|
||||||
|
|
||||||
|
int compresstoBinaryFile(string modifiedSourceFilePath, Node * alphabetRoot)
|
||||||
|
{
|
||||||
|
string textCodeFilePath = "results/textCoded.txt";
|
||||||
|
string decodedTextFilePath = "results/decodedText.txt";
|
||||||
|
string binaryResultFilePath = "results/BinaryCoded.dat";
|
||||||
|
|
||||||
|
writeCodedText(modifiedSourceFilePath, textCodeFilePath, alphabetRoot);
|
||||||
|
extractCodedText(textCodeFilePath, decodedTextFilePath, alphabetRoot);
|
||||||
|
int linesNum = codeBinaryFromText(textCodeFilePath, binaryResultFilePath);
|
||||||
|
return linesNum;
|
||||||
|
}
|
|
@ -0,0 +1,38 @@
|
||||||
|
#ifndef ALGORITHMS_CPLUSPLUS_TREES_HUFFMANCODE_HUFFMAN_H_
|
||||||
|
#define ALGORITHMS_CPLUSPLUS_TREES_HUFFMANCODE_HUFFMAN_H_
|
||||||
|
|
||||||
|
#include <string>
|
||||||
|
|
||||||
|
using namespace std;
|
||||||
|
|
||||||
|
// One node of the Huffman code tree.
class Node{
public:
    Node * parent;  // link to the parent node (may be nullptr)
    Node * left;    // left child (nullptr on a leaf)
    Node * right;   // right child (nullptr on a leaf)
    char key;       // character stored in the node
    int freq;       // occurrence count associated with the node
    string name;    // not set by the constructor; starts out empty

    // Stores the three links, the key (converted from int to char) and the
    // frequency. `name` is default-constructed.
    Node(Node * p, Node * l, Node * r, int k, int f)
        : parent(p),
          left(l),
          right(r),
          key(static_cast<char>(k)),
          freq(f)
    {
    }
};
|
||||||
|
|
||||||
|
// reduces file size
|
||||||
|
// Reads the file at filePath, replaces each run of two or more whitespace
// characters within a line by a single space, joins the lines without any
// separator and writes the result to modifiedSourceFilePath.
void minifyFile(string filePath, string modifiedSourceFilePath);
|
||||||
|
|
||||||
|
// tree of ascii chars, used as the codes references
|
||||||
|
// Counts the characters of the file at alphabetSourceFilePath and builds the
// code tree from those counts; returns the root of the tree. The nodes are
// heap-allocated and are not released by the functions declared here.
Node * buildCodeTree(string alphabetSourceFilePath);
|
||||||
|
|
||||||
|
// linesNum tells how many binary lines to be converted
|
||||||
|
// Encodes the file at modifiedSourceFilePath with the tree at alphabetRoot.
// Output files are written to fixed paths under "results/" (textCoded.txt,
// decodedText.txt, BinaryCoded.dat). The return value is the count of packed
// integers reported by the binary-encoding step; pass it to
// decompressBinaryFile() as linesNum.
int compresstoBinaryFile(string modifiedSourceFilePath, Node * alphabetRoot);
|
||||||
|
|
||||||
|
// decompresses the binary file into text file
|
||||||
|
// Reads linesNum packed integers from binarySourceFilePath, expands each to a
// 31-character bit string, decodes the concatenated bits with the tree at
// alphabetRoot and writes the decoded text to textDistnationFilePath.
void decompressBinaryFile(string binarySourceFilePath, string textDistnationFilePath,int linesNum, Node * alphabetRoot);
|
||||||
|
|
||||||
|
#endif // ALGORITHMS_CPLUSPLUS_TREES_HUFFMANCODE_HUFFMAN_H_
|
|
@ -0,0 +1,27 @@
|
||||||
|
#include "./huffman.h"
|
||||||
|
|
||||||
|
|
||||||
|
int main() {
|
||||||
|
|
||||||
|
// example
|
||||||
|
string sourceFilePath = "results/data-sample.txt";
|
||||||
|
|
||||||
|
string modifiedSourceFilePath = "results/modifiedSource.txt";
|
||||||
|
|
||||||
|
string binaryResultFilePath = "results/BinaryCoded.dat"; // the compressed file
|
||||||
|
string binaryDecodedFilePath = "results/decodedBinary.txt"; // after decompression
|
||||||
|
|
||||||
|
// reduces file size
|
||||||
|
minifyFile(sourceFilePath, modifiedSourceFilePath);
|
||||||
|
|
||||||
|
// tree of ascii chars, used as the codes references
|
||||||
|
Node * alphabetRoot = buildCodeTree(modifiedSourceFilePath);
|
||||||
|
|
||||||
|
// linesNum tells how many binary lines to be converted
|
||||||
|
int linesNum = compresstoBinaryFile(modifiedSourceFilePath, alphabetRoot);
|
||||||
|
|
||||||
|
// decompresses the binary file into text file
|
||||||
|
decompressBinaryFile(binaryResultFilePath, binaryDecodedFilePath, linesNum, alphabetRoot);
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
|
@ -0,0 +1,2 @@
|
||||||
|
lorem text:
|
||||||
|
"Sed ut perspiciatis unde omnis iste natus error sit voluptatem accusantium doloremque laudantium, totam rem aperiam, eaque ipsa quae ab illo inventore veritatis et quasi architecto beatae vitae dicta sunt explicabo. Nemo enim ipsam voluptatem quia voluptas sit aspernatur aut odit aut fugit, sed quia consequuntur magni dolores eos qui ratione voluptatem sequi nesciunt. Neque porro quisquam est, qui dolorem ipsum quia dolor sit amet, consectetur, adipisci velit, sed quia non numquam eius modi tempora incidunt ut labore et dolore magnam aliquam quaerat voluptatem. Ut enim ad minima veniam, quis nostrum exercitationem ullam corporis suscipit laboriosam, nisi ut aliquid ex ea commodi consequatur? Quis autem vel eum iure reprehenderit qui in ea voluptate velit esse quam nihil molestiae consequatur, vel illum qui dolorem eum fugiat quo voluptas nulla pariatur?"
|
Loading…
Reference in New Issue