chore(CPlusPlus): added Huffman (#468)

Signed-off-by: Kareim Tarek AbdelAzeem <kareimgazar1990@gmail.com>

Co-authored-by: Ankur <54987647+Arsenic-ATG@users.noreply.github.com>
pull/472/head
Kareim Tarek AbdelAzeem 2021-09-20 13:45:11 +02:00 committed by GitHub
parent 2def163eae
commit b330bdb75d
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 374 additions and 0 deletions

View File

@ -0,0 +1,23 @@
# Huffman Code
## Goal
The goal is to minimize the size of files by exploiting the fact that some symbols occur much more frequently than others (just as the word "the" does in English text): the most common symbols are written with fewer bits and the least common ones with more bits.
## Get Started
In the `main.cpp` file you can find a running example; the results of each phase can be inspected in detail in the `results` folder.
## The trick
The difficulty with Huffman codes is that they rely on the concept of **make the common case faster**, which depends on the frequency of the characters in the file, and that frequency is a property of the file itself.
So, for better performance, the program records the frequency of each character in the file before processing, providing the algorithm with the best possible prediction.
## Performance
The algorithm here decreases file size to about half on average.
Personally, I used this program to minimize my XML and JSON files.
## Application
This algorithm is mainly used to decrease the size of files, ranging from text to audio.

View File

@ -0,0 +1,284 @@
// this will require one to change the source directory in the build system
#include "algorithms/CPlusPlus/Trees/huffmanCode/huffman.h"
#include <iostream>
#include <vector>
#include <string>
#include <fstream>
#include <sstream>
#include <iterator>
#include <algorithm>
#include <map>
#include <bitset>
#include <utility>
#include <regex> // NOLINT (build/c++11)
using namespace std;
map<char, int> frequency;
// Node of the binary code tree: three links, a character key, a frequency
// and a name string (left empty by the constructor).
// NOTE(review): huffman.h, included at the top of this file, defines a class
// with the same name and members - confirm the two definitions do not clash
// in the build.
class Node{
public:
    Node * parent;
    Node * left;
    Node * right;
    char key;
    int freq;
    string name;

    // p, l, r : parent / left / right links, stored as given (may be nullptr)
    // k       : key, passed as int and narrowed to char
    // f       : frequency
    Node(Node * p, Node * l, Node * r, int k, int f)
        : parent(p),
          left(l),
          right(r),
          key(static_cast<char>(k)),
          freq(f),
          name()
    {
    }
};
// Prints the key of every leaf of the tree rooted at `root` to stdout,
// each followed by a single space, visiting the left subtree before the
// right one. Internal nodes are traversed but not printed.
// A null `root` (empty tree) prints nothing.
void postOrder(Node * root)
{
    // Guarding here covers both an empty tree and missing children, so the
    // recursive calls below need no separate null checks.
    if(root == nullptr) return;

    postOrder(root->left);
    postOrder(root->right);

    const bool isLeaf = (root->left == nullptr && root->right == nullptr);
    if(isLeaf) cout << root->key << " ";
}
// Strict "less than" on the count (second member) of two
// (character, count) pairs; used to sort the alphabet by ascending frequency.
// The arguments are taken by const reference so the comparator can never
// modify the elements it compares and also accepts const objects and
// temporaries.
bool cmp(const pair<char, int>& a,
         const pair<char, int>& b)
{
    return a.second < b.second;
}
// Ordering predicate for tree nodes: true when `a` has a strictly smaller
// frequency than `b`. Both pointers are dereferenced without a null check.
bool compare(Node * a, Node * b)
{
    const int lhsFreq = a->freq;
    const int rhsFreq = b->freq;
    return lhsFreq < rhsFreq;
}
// Copies every (character, count) entry of the map into a vector and
// returns that vector sorted by ascending count (ordering given by cmp).
// The map itself is left untouched.
vector<pair<char, int> > sortChars(map<char, int>& M)
{
    vector<pair<char, int> > ordered(M.begin(), M.end());
    sort(ordered.begin(), ordered.end(), cmp);
    return ordered;
}
// Adds one to the file-level `frequency` counter of every character that
// occurs in `textLine` (a character seen for the first time starts at zero).
void updateMap(string textLine)
{
    for(string::size_type pos = 0; pos < textLine.size(); ++pos)
    {
        ++frequency[textLine[pos]];
    }
}
// Reads the file at `alphabetSource` line by line, counts its characters in
// the file-level `frequency` map and returns the (character, count) pairs
// sorted by ascending count.
// Line terminators are not counted: getline strips them and nothing is added
// back. The map is not reset here, so counts accumulate across calls.
vector<pair<char, int> > getAlphabet(string alphabetSource)
{
    ifstream source(alphabetSource);
    for(string line; getline(source, line); )
    {
        updateMap(line);
    }
    source.close();
    return sortChars(frequency);
}
// Finds the leaf that holds `key` in the code tree rooted at `root` and
// appends the path to it to `binaryCode`: '0' for every step to a left
// child, '1' for every step to a right child.
//
// The direction at each internal node is chosen from the left child's
// `name`, which must list every character stored below that child.
//
// Returns the matching leaf, or nullptr when `root` is null or `key` is not
// in the tree; in that case `binaryCode` is restored to the value it had on
// entry. Only leaves are compared against `key`, so a '\0' key is not
// mistaken for an internal node (internal nodes are built with key '\0').
Node * getBinary(Node * root, char key, string & binaryCode)
{
    const string::size_type lengthOnEntry = binaryCode.size();

    Node * current = root;
    while(current != nullptr)
    {
        const bool isLeaf = (current->left == nullptr && current->right == nullptr);
        if(isLeaf)
        {
            if(current->key == key) return current;
            break; // reached a leaf holding another character: key is absent
        }

        if(current->left != nullptr && current->left->name.find(key) != string::npos)
        {
            binaryCode += '0';
            current = current->left;
        }
        else
        {
            // A missing right child ends the loop through the null check above.
            binaryCode += '1';
            current = current->right;
        }
    }

    // Not found: discard the partial path so the caller never sees stray bits.
    binaryCode.resize(lengthOnEntry);
    return nullptr;
}
// Decodes a string of bit characters with the code tree rooted at `root`.
// Starting at the root, '0' moves to the left child and any other character
// moves to the right child; whenever a leaf is reached its key is appended
// to the result and the walk restarts from the root.
//
// Returns the decoded text. A null `root` yields an empty string. Decoding
// stops early (keeping what was decoded so far) if a bit would step onto a
// missing child, which happens for a single-leaf tree or malformed input.
// Bits left over after the last complete code are ignored.
string extract(Node * root, string text)
{
    string result;
    if(root == nullptr) return result;

    Node * head = root;
    for(char bit : text)
    {
        Node * next = (bit == '0') ? head->left : head->right;
        if(next == nullptr) break; // would walk off the tree

        head = next;
        if(head->left == nullptr && head->right == nullptr)
        {
            result.push_back(head->key);
            head = root;
        }
    }
    return result;
}
// Encodes `text` character by character: each character's bit path, as
// produced by getBinary on the tree rooted at `root`, is appended to the
// returned string in order. The node returned by getBinary is not used.
string code(Node * root, string text)
{
    string encoded;
    for(char symbol : text)
    {
        string bits;
        getBinary(root, symbol, bits);
        encoded.append(bits);
    }
    return encoded;
}
// Packs a string of '0'/'1' characters into ints and writes them, in the
// machine's native byte layout, to `binaryResultFilePath` (the file is
// created or truncated).
//
// The text is cut into consecutive chunks of 31 bits so every value fits a
// non-negative int. A final chunk shorter than 31 bits is padded on the
// right with '0' so its bits keep their positions when a reader expands the
// int back to 31 bits. The true bit length is not stored, so a decoder may
// see up to 30 trailing padding zeros.
//
// Each raw chunk and its integer value are also logged to
// "results/integers.txt" for debugging.
//
// Returns the number of ints written (0 for an empty `binaryText`).
// std::stoi throws std::invalid_argument if a chunk does not start with a
// binary digit.
int codedBinay(string binaryText, string binaryResultFilePath)
{
    const string::size_type chunkBits = 31; // largest width that fits a non-negative int

    ofstream binaryResultFile(binaryResultFilePath, ios::out | ios::binary);
    ofstream integersFile("results/integers.txt"); // for debugging

    int intNumber = 0;
    for(string::size_type base = 0; base < binaryText.size(); base += chunkBits)
    {
        // substr clamps to the end of the string, so the last chunk may be short.
        string target = binaryText.substr(base, chunkBits);
        integersFile << target << '\n';

        // Left-align a short tail; a full chunk is unchanged by this.
        target.resize(chunkBits, '0');

        const int outInteger = stoi(target, nullptr, 2);
        integersFile << outInteger << '\n';

        binaryResultFile.write(reinterpret_cast<const char *>(&outInteger), sizeof(int));
        ++intNumber;
    }

    binaryResultFile.close();
    return intNumber;
}
// Writes a whitespace-reduced copy of the file at `filePath` to
// `modifiedSourceFilePath`: within each input line every run of two or more
// whitespace characters becomes a single space, and the lines are then
// joined with no separator between them, so the output has no line breaks.
// The whole input is read before the output file is opened.
void minifyFile(string filePath, string modifiedSourceFilePath)
{
    const regex whitespaceRun("\\s{2,}");
    string collapsed;

    ifstream source(filePath);
    for(string line; getline(source, line); )
    {
        collapsed.append(regex_replace(line, whitespaceRun, " "));
    }
    source.close();

    ofstream destination(modifiedSourceFilePath);
    destination << collapsed;
    destination.close();
}
// Builds the Huffman code tree for the characters of the file at
// `alphabetSourceFilePath`.
//
// One leaf is created per distinct character (key = character, freq = its
// count, name = the character). The two lowest-frequency nodes are then
// merged repeatedly under a new internal node (key '\0', freq = sum of the
// children, name = left name + right name) until a single root remains.
//
// Returns the root, or nullptr when the file yields no characters (missing
// or empty file). All nodes are allocated with `new` and are not freed here.
Node * buildCodeTree(string alphabetSourceFilePath)
{
    vector<pair<char, int>> results = getAlphabet(alphabetSourceFilePath);
    if(results.empty()) return nullptr; // nothing to encode: avoid indexing an empty vector

    vector<Node *> nodes;
    nodes.reserve(results.size());
    for(const auto & result : results)
    {
        Node * node = new Node(nullptr, nullptr, nullptr, result.first, result.second);
        node->name = node->key;
        nodes.push_back(node);
    }

    // `nodes` is kept sorted by ascending frequency, so the two cheapest
    // nodes are always at the front.
    while(nodes.size() > 1)
    {
        Node * right = nodes[0];
        Node * left = nodes[1];
        nodes.erase(nodes.begin(), nodes.begin() + 2);

        Node * parent = new Node(nullptr, left, right, '\0', left->freq + right->freq);
        parent->name = left->name + right->name;
        left->parent = parent;
        right->parent = parent;

        nodes.push_back(parent);
        sort(nodes.begin(), nodes.end(), compare);
    }
    return nodes[0];
}
// Encodes the file at `modifiedSourceFilePath` with the code tree rooted at
// `root` and writes the resulting '0'/'1' text to `textCodeFilePath`.
// Input is processed one line at a time; no line break is encoded or
// written, so the output is one continuous string of bit characters.
void writeCodedText(string modifiedSourceFilePath, string textCodeFilePath, Node * root)
{
    ifstream plainInput(modifiedSourceFilePath);
    ofstream codedOutput(textCodeFilePath);

    for(string line; getline(plainInput, line); )
    {
        codedOutput << code(root, line);
    }

    plainInput.close();
    codedOutput.close();
}
// Decodes the '0'/'1' text stored in `textCodeFilePath` with the code tree
// rooted at `root` and writes the recovered characters to `decodedFilePath`.
// Each input line is decoded on its own and the pieces are written back to
// back, with no line breaks added.
void extractCodedText(string textCodeFilePath, string decodedFilePath, Node * root)
{
    ifstream codedInput(textCodeFilePath);
    ofstream plainOutput(decodedFilePath);

    for(string bits; getline(codedInput, bits); )
    {
        plainOutput << extract(root, bits);
    }

    codedInput.close();
    plainOutput.close();
}
// Converts the '0'/'1' text stored in `textCodeFilePath` into the packed
// binary file `binaryResultFilePath` by handing it to codedBinay.
//
// All lines of the input are joined and packed in a single call, because
// codedBinay recreates its output file every time it is invoked: packing
// line by line would keep only the last line's data.
//
// Returns the number of ints written. When the input holds no bit
// characters (missing, empty or blank file) nothing is written and 0 is
// returned.
int codeBinaryFromText(string textCodeFilePath, string binaryResultFilePath)
{
    ifstream sourceTextCodedFile(textCodeFilePath);
    string allBits;
    for(string line; getline(sourceTextCodedFile, line); )
    {
        allBits += line;
    }
    sourceTextCodedFile.close();

    if(allBits.empty()) return 0;
    return codedBinay(allBits, binaryResultFilePath);
}
// Restores text from a packed binary file.
//
// Reads up to `linesNum` ints from `binarySourceFilePath`, expands each one
// to its 31-bit '0'/'1' representation, decodes the concatenated bits with
// the code tree rooted at `alphabetRoot` and writes the decoded characters
// to `textDistnationFilePath` (created or truncated).
//
// Reading stops early if the file is missing or holds fewer ints than
// requested; only the ints actually read are decoded.
void decompressBinaryFile(string binarySourceFilePath, string textDistnationFilePath,int linesNum, Node * alphabetRoot)
{
    // The source is only read, so it is opened for input.
    ifstream binaryResultFile(binarySourceFilePath, ios::in | ios::binary);
    ofstream decodedbinaryFile(textDistnationFilePath);

    string allText;
    for(int i = 0; i < linesNum; i++)
    {
        int codedNumber = 0;
        if(!binaryResultFile.read(reinterpret_cast<char *>(&codedNumber), sizeof(int)))
        {
            break; // short or unreadable file: never decode an unread value
        }
        allText += bitset<31>(codedNumber).to_string();
    }

    decodedbinaryFile << extract(alphabetRoot, allText);

    binaryResultFile.close();
    decodedbinaryFile.close();
}
// Runs the compression steps on the file at `modifiedSourceFilePath` using
// the code tree rooted at `alphabetRoot`:
//   1. writes the '0'/'1' encoding to results/textCoded.txt,
//   2. decodes that text again into results/decodedText.txt,
//   3. packs the encoding into results/BinaryCoded.dat.
// Returns the value reported by the packing step (codeBinaryFromText).
int compresstoBinaryFile(string modifiedSourceFilePath, Node * alphabetRoot)
{
    const string codedTextPath = "results/textCoded.txt";
    const string roundTripTextPath = "results/decodedText.txt";
    const string packedBinaryPath = "results/BinaryCoded.dat";

    writeCodedText(modifiedSourceFilePath, codedTextPath, alphabetRoot);
    extractCodedText(codedTextPath, roundTripTextPath, alphabetRoot);

    return codeBinaryFromText(codedTextPath, packedBinaryPath);
}

View File

@ -0,0 +1,38 @@
#ifndef ALGORITHMS_CPLUSPLUS_TREES_HUFFMANCODE_HUFFMAN_H_
#define ALGORITHMS_CPLUSPLUS_TREES_HUFFMANCODE_HUFFMAN_H_
#include <string>
using namespace std;
// Node of the binary code tree: three links, a character key, a frequency
// and a name string (left empty by the constructor).
class Node{
public:
    Node * parent;
    Node * left;
    Node * right;
    char key;
    int freq;
    string name;

    // p, l, r : parent / left / right links, stored as given (may be nullptr)
    // k       : key, passed as int and narrowed to char
    // f       : frequency
    Node(Node * p, Node * l, Node * r, int k, int f)
        : parent(p),
          left(l),
          right(r),
          key(static_cast<char>(k)),
          freq(f),
          name()
    {
    }
};
// Reduces file size: writes a copy of the file at filePath to
// modifiedSourceFilePath in which runs of two or more whitespace characters
// are collapsed to a single space and the lines are joined together.
void minifyFile(string filePath, string modifiedSourceFilePath);
// Builds the tree of characters found in the file at alphabetSourceFilePath,
// weighted by how often each occurs; the tree is the reference for the codes
// used by the functions below. Returns the root of the tree.
Node * buildCodeTree(string alphabetSourceFilePath);
// Encodes the file at modifiedSourceFilePath with the tree alphabetRoot and
// writes the intermediate and packed results into the results folder.
// The returned count tells how many binary values are to be converted back;
// pass it as linesNum to decompressBinaryFile.
int compresstoBinaryFile(string modifiedSourceFilePath, Node * alphabetRoot);
// Decompresses the binary file into a text file: reads linesNum values from
// binarySourceFilePath, decodes them with the tree alphabetRoot and writes
// the text to textDistnationFilePath.
void decompressBinaryFile(string binarySourceFilePath, string textDistnationFilePath,int linesNum, Node * alphabetRoot);
#endif // ALGORITHMS_CPLUSPLUS_TREES_HUFFMANCODE_HUFFMAN_H_

View File

@ -0,0 +1,27 @@
#include "./huffman.h"
int main() {
// example
string sourceFilePath = "results/data-sample.txt";
string modifiedSourceFilePath = "results/modifiedSource.txt";
string binaryResultFilePath = "results/BinaryCoded.dat"; // the compressed file
string binaryDecodedFilePath = "results/decodedBinary.txt"; // after decompression
// reduces file size
minifyFile(sourceFilePath, modifiedSourceFilePath);
// tree of ascii chars, used as the codes references
Node * alphabetRoot = buildCodeTree(modifiedSourceFilePath);
// linesNum tells how many binary lines to be converted
int linesNum = compresstoBinaryFile(modifiedSourceFilePath, alphabetRoot);
// decompresses the binary file into text file
decompressBinaryFile(binaryResultFilePath, binaryDecodedFilePath, linesNum, alphabetRoot);
return 0;
}

View File

@ -0,0 +1,2 @@
lorem text:
"Sed ut perspiciatis unde omnis iste natus error sit voluptatem accusantium doloremque laudantium, totam rem aperiam, eaque ipsa quae ab illo inventore veritatis et quasi architecto beatae vitae dicta sunt explicabo. Nemo enim ipsam voluptatem quia voluptas sit aspernatur aut odit aut fugit, sed quia consequuntur magni dolores eos qui ratione voluptatem sequi nesciunt. Neque porro quisquam est, qui dolorem ipsum quia dolor sit amet, consectetur, adipisci velit, sed quia non numquam eius modi tempora incidunt ut labore et dolore magnam aliquam quaerat voluptatem. Ut enim ad minima veniam, quis nostrum exercitationem ullam corporis suscipit laboriosam, nisi ut aliquid ex ea commodi consequatur? Quis autem vel eum iure reprehenderit qui in ea voluptate velit esse quam nihil molestiae consequatur, vel illum qui dolorem eum fugiat quo voluptas nulla pariatur?"