chore(CPlusPlus): added Huffman (#468)

Signed-off-by: Kareim Tarek AbdelAzeem <kareimgazar1990@gmail.com> Co-authored-by: Ankur <54987647+Arsenic-ATG@users.noreply.github.com>
2021-09-20 13:45:11 +02:00 · 2021-09-20 13:45:11 +02:00 · b330bdb75d
parent 2def163eae
commit b330bdb75d
5 changed files with 374 additions and 0 deletions
--- a/algorithms/CPlusPlus/Trees/huffmanCode/README.md
+++ b/algorithms/CPlusPlus/Trees/huffmanCode/README.md
@ -0,0 +1,23 @@
 # Huffman Code
 ## Goal
 The goal is to minimize the size of files by exploiting the fact that some characters occur much more frequently than others (for example, the letters of a common word such as "the"), so the most common characters are written with fewer bits and the least common characters with more bits.
 ## Get Started
 In the `main.cpp` file you can find a running example; the output of every phase is written to the `results` folder, where it can be inspected in detail.
 ## The trick
 Huffman coding relies on the concept of **making the common case faster**, which depends on how frequently each character occurs in the file, and that frequency varies from file to file.
 So, for better performance, the program records the frequency of each character in the file before processing, providing the algorithm with the best prediction.
 ## Performance
 On average, this algorithm reduces file size by about half.
 Personally, I used this program to minimize my XML and JSON files.
 ## Application
 This algorithm is mainly used to reduce the size of files of many kinds, from text to audio.
--- a/algorithms/CPlusPlus/Trees/huffmanCode/huffman.cpp
+++ b/algorithms/CPlusPlus/Trees/huffmanCode/huffman.cpp
@ -0,0 +1,284 @@
 // this will require one to change the source directory in the build system
 #include "algorithms/CPlusPlus/Trees/huffmanCode/huffman.h"
 #include <iostream>
 #include <vector>
 #include <string>
 #include <fstream>
 #include <sstream>
 #include <iterator>
 #include <algorithm>
 #include <map>
 #include <bitset>
 #include <utility>
 #include <regex>    // NOLINT (build/c++11)
 using namespace std;
 // Occurrence count of every character seen so far. updateMap() increments it
 // and getAlphabet() reads it; nothing in this file clears it, so the counts
 // accumulate across successive getAlphabet() calls.
 map<char, int> frequency;
 // One node of the Huffman code tree.
 // NOTE(review): huffman.h, which this file includes, appears to define the
 // same class - confirm that only one definition is meant to be compiled.
 class Node{
 public:
    Node * parent;  // link towards the root; nullptr when not set
    Node * left;    // child followed for a '0' bit
    Node * right;   // child followed for any other bit
    char key;       // character stored in the node ('\0' is passed for merged nodes)
    int freq;       // occurrence count (sum of both children for merged nodes)
    string name;    // characters covered by this subtree; left empty by the constructor
    // Stores the three links, the key (converted from int to char) and the
    // frequency exactly as given.
    Node(Node * p, Node * l, Node * r, int k, int f)
        : parent(p), left(l), right(r), key(k), freq(f)
    {
    }
 };
 // Prints the key of every leaf below root to stdout, left subtree first,
 // each key followed by a single space. root must not be null.
 void postOrder(Node * root)
 {
    // A node without children is a leaf: print it and stop descending.
    if(root->left == nullptr && root->right == nullptr)
    {
        cout << root->key << " ";
        return;
    }
    if(root->left != nullptr)
    {
        postOrder(root->left);
    }
    if(root->right != nullptr)
    {
        postOrder(root->right);
    }
 }
 // Orders (character, count) pairs by ascending count.
 // Takes its arguments by const reference so it can be called with const
 // objects and temporaries, as a sort comparator is expected to allow.
 // Returns true when a's count is strictly smaller than b's.
 bool cmp(const pair<char, int>& a,
         const pair<char, int>& b)
 {
    return a.second < b.second;
 }
 // Orders tree nodes by ascending frequency: true when a occurs strictly less
 // often than b. Both pointers must be non-null.
 bool compare(Node * a, Node * b)
 {
    return b->freq > a->freq;
 }
 // Returns the (character, count) entries of M as a vector ordered by
 // ascending count, using cmp as the ordering. M itself is not modified.
 vector<pair<char, int> > sortChars(map<char, int>& M)
 {
    // Copy the entries in the map's iteration order, then order them by count.
    vector<pair<char, int> > byCount(M.begin(), M.end());
    sort(byCount.begin(), byCount.end(), cmp);
    return byCount;
 }
 // Adds one to the global frequency count of every character in textLine.
 void updateMap(string textLine)
 {
    for(size_t i = 0; i < textLine.size(); ++i)
    {
        ++frequency[textLine[i]];
    }
 }
 // Reads the file at alphabetSource line by line, adds the characters of each
 // line to the global frequency map and returns the map's entries ordered by
 // ascending count. Line breaks are consumed by getline and are not counted.
 vector<pair<char, int> > getAlphabet(string alphabetSource)
 {
    ifstream source(alphabetSource);
    for(string line; getline(source, line); )
    {
        updateMap(line);
    }
    source.close();
    return sortChars(frequency);
 }
 // Finds the leaf that stores key and appends its code to binaryCode: one '0'
 // for every step into a left child and one '1' for every step into a right
 // child on the way down from root.
 //
 // The descent is steered by the name of the left child: when that name
 // contains key the search goes left, otherwise right.
 //
 // Returns the leaf holding key. Returns nullptr, leaving binaryCode
 // untouched, when root is null or no leaf for key is reached. Only leaves
 // are matched against key, so the placeholder key of merged nodes can never
 // be mistaken for a real character.
 Node * getBinary(Node * root, char key, string & binaryCode)
 {
    string path;  // collected locally so a failed lookup appends nothing
    Node * current = root;
    while(current != nullptr)
    {
        const bool isLeaf = current->left == nullptr && current->right == nullptr;
        if(isLeaf)
        {
            if(current->key != key) return nullptr;
            binaryCode += path;
            return current;
        }
        if(current->left != nullptr && current->left->name.find(key) != string::npos)
        {
            path += '0';
            current = current->left;
        }
        else
        {
            path += '1';
            current = current->right;
        }
    }
    // Walked off the tree through a missing right child: key is not present.
    return nullptr;
 }
 // Decodes a string of bit characters back into text by walking the tree:
 // '0' moves to the left child, any other character moves to the right child,
 // and reaching a leaf emits its key and restarts the walk at root.
 //
 // Returns the decoded text. Bits left over after the last completed leaf are
 // ignored. A null root yields an empty string, and a bit that has no
 // matching child is skipped with the walk restarting at root, instead of
 // dereferencing a null pointer.
 string extract(Node * root, string text)
 {
    string result;
    if(root == nullptr) return result;
    Node * head = root;
    for(char bit : text)
    {
        Node * next = (bit == '0') ? head->left : head->right;
        if(next == nullptr)
        {
            // No branch for this bit (for example a tree made of a single
            // leaf): drop the bit and start over.
            head = root;
            continue;
        }
        head = next;
        if(head->left == nullptr && head->right == nullptr)
        {
            result.push_back(head->key);
            head = root;
        }
    }
    return result;
 }
 // Encodes text character by character: the code that getBinary produces for
 // each character is appended to the returned string of '0'/'1' characters.
 string code(Node * root, string text)
 {
    string encoded;
    for(char symbol : text)
    {
        string bits;
        getBinary(root, symbol, bits);
        encoded.append(bits);
    }
    return encoded;
 }
 // Packs a string of '0'/'1' characters into integers and writes them, in the
 // machine's native int representation, to binaryResultFilePath (the file is
 // created or truncated).
 //
 // The text is cut into consecutive chunks of 31 characters so that every
 // chunk fits into a non-negative int; the last chunk may be shorter. Each
 // chunk and its integer value are also logged to results/integers.txt for
 // debugging.
 //
 // Returns the number of integers written; an empty text writes nothing and
 // returns 0. A one-character remainder is written like any other remainder.
 // stoi throws std::invalid_argument if a chunk does not start with a binary
 // digit.
 //
 // NOTE(review): the length of a short final chunk is not stored, and
 // decompressBinaryFile expands every integer to 31 bits, so the tail of the
 // stream may not round-trip - confirm the intended format.
 int codedBinay(string binaryText, string binaryResultFilePath)
 {
    const size_t chunkBits = 31;  // largest bit count that keeps the value non-negative
    ofstream binaryResultFile(binaryResultFilePath, ios::out | ios::binary);
    ofstream integersFile("results/integers.txt"); // for debugging
    int intNumber = 0;
    for(size_t base = 0; base < binaryText.size(); base += chunkBits)
    {
        // substr clamps the length, so the final chunk is simply shorter.
        const string target = binaryText.substr(base, chunkBits);
        const int outInteger = stoi(target, nullptr, 2);
        integersFile << target << '\n' << outInteger << '\n';
        binaryResultFile.write(reinterpret_cast<const char *>(&outInteger), sizeof(int));
        intNumber++;
    }
    return intNumber;
 }
 // Writes a compacted copy of filePath to modifiedSourceFilePath: in every
 // line each run of two or more whitespace characters becomes a single space,
 // and the lines are joined without any separator between them.
 void minifyFile(string filePath, string modifiedSourceFilePath)
 {
    const regex whitespaceRun("\\s{2,}");
    string minified;
    ifstream input(filePath);
    for(string line; getline(input, line); )
    {
        minified += regex_replace(line, whitespaceRun, " ");
    }
    // The input is closed before the output is opened.
    input.close();
    ofstream output(modifiedSourceFilePath);
    output << minified;
    output.close();
 }
 // Builds the Huffman code tree for the characters of the given file.
 //
 // getAlphabet supplies the (character, count) pairs in ascending count
 // order; each becomes a leaf whose name is its character. The two least
 // frequent nodes are then merged repeatedly under a new node that carries
 // the sum of their counts and the concatenation of their names, until a
 // single root remains.
 //
 // Returns the root, or nullptr when the file yields no characters at all.
 // The nodes are allocated with new and nothing in this file deletes them.
 Node * buildCodeTree(string alphabetSourceFilePath)
 {
    const vector<pair<char, int>> alphabet = getAlphabet(alphabetSourceFilePath);
    vector<Node *> nodes;
    nodes.reserve(alphabet.size());
    for(const auto & entry : alphabet)
    {
        Node * leaf = new Node(nullptr, nullptr, nullptr, entry.first, entry.second);
        leaf->name = leaf->key;
        nodes.push_back(leaf);
    }
    // Nothing to build from: report it instead of reading nodes[0] of an
    // empty vector.
    if(nodes.empty()) return nullptr;
    while(nodes.size() > 1)
    {
        // The vector is kept ordered by count, so the front holds the two
        // least frequent nodes.
        Node * right = nodes[0];
        Node * left = nodes[1];
        nodes.erase(nodes.begin(), nodes.begin() + 2);
        Node * parent = new Node(nullptr, left, right, '\0', left->freq + right->freq);
        parent->name += left->name + right->name;
        // Record the upward links as well, so the parent field is usable.
        left->parent = parent;
        right->parent = parent;
        nodes.push_back(parent);
        sort(nodes.begin(), nodes.end(), compare);
    }
    return nodes[0];
 }
 // Encodes the file at modifiedSourceFilePath with the tree rooted at root and
 // writes the resulting '0'/'1' characters to textCodeFilePath. Lines are
 // encoded one after another and written back to back without line breaks,
 // so the output is a single string.
 void writeCodedText(string modifiedSourceFilePath, string textCodeFilePath, Node * root)
 {
    ifstream plainText(modifiedSourceFilePath);
    ofstream codedOutput(textCodeFilePath);
    for(string line; getline(plainText, line); )
    {
        const string codedLine = code(root, line);
        codedOutput << codedLine;
    }
    plainText.close();
    codedOutput.close();
 }
 // Decodes the '0'/'1' text file at textCodeFilePath with the tree rooted at
 // root and writes the decoded text to decodedFilePath. Every input line is
 // decoded on its own and the results are written without line breaks.
 void extractCodedText(string textCodeFilePath, string decodedFilePath, Node * root)
 {
    ifstream codedInput(textCodeFilePath);
    ofstream plainOutput(decodedFilePath);
    for(string bits; getline(codedInput, bits); )
    {
        plainOutput << extract(root, bits);
    }
    codedInput.close();
    plainOutput.close();
 }
 // Feeds every line of the '0'/'1' text file at textCodeFilePath to codedBinay,
 // which writes it to binaryResultFilePath. Returns the value codedBinay
 // reported for the last line read, or 0 when the file has no lines.
 // NOTE(review): codedBinay reopens the output file on every call, so with
 // several input lines only the last one appears to survive - confirm the
 // input is always a single line.
 int codeBinaryFromText(string textCodeFilePath, string binaryResultFilePath)
 {
    ifstream codedText(textCodeFilePath);
    int written = 0;
    for(string bits; getline(codedText, bits); )
    {
        written = codedBinay(bits, binaryResultFilePath);
    }
    codedText.close();
    return written;
 }
 // Restores text from a binary file of packed integers.
 //
 // Up to linesNum integers are read from binarySourceFilePath; each is
 // expanded to its 31-bit binary representation, the bits are concatenated
 // and decoded with the tree rooted at alphabetRoot, and the decoded text is
 // written to textDistnationFilePath.
 //
 // The input is opened for reading only, and reading stops at the first
 // integer that cannot be read completely, so a short or missing file never
 // contributes stale or uninitialised bits.
 //
 // NOTE(review): every integer is expanded to exactly 31 bits, while
 // codedBinay may store a shorter final chunk; the leading zeros added here
 // can change how the tail decodes - confirm the intended format.
 void decompressBinaryFile(string binarySourceFilePath, string textDistnationFilePath,int linesNum, Node * alphabetRoot)
 {
    ifstream binaryResultFile(binarySourceFilePath, ios::in | ios::binary);
    ofstream decodedbinaryFile(textDistnationFilePath);
    string allText;
    for(int i = 0; i < linesNum; i++)
    {
        int codedNumber = 0;
        if(!binaryResultFile.read(reinterpret_cast<char *>(&codedNumber), sizeof(int)))
        {
            break;
        }
        allText += bitset<31>(codedNumber).to_string();
    }
    decodedbinaryFile << extract(alphabetRoot, allText);
    binaryResultFile.close();
    decodedbinaryFile.close();
 }
 // Runs the compression steps for the file at modifiedSourceFilePath using the
 // tree rooted at alphabetRoot:
 //   1. writes the '0'/'1' encoding to results/textCoded.txt,
 //   2. decodes that text again into results/decodedText.txt,
 //   3. packs the encoding into results/BinaryCoded.dat.
 // Returns the value reported by the packing step.
 int compresstoBinaryFile(string modifiedSourceFilePath, Node * alphabetRoot)
 {
    const string codedTextPath = "results/textCoded.txt";
    const string roundTripPath = "results/decodedText.txt";
    const string packedPath = "results/BinaryCoded.dat";
    writeCodedText(modifiedSourceFilePath, codedTextPath, alphabetRoot);
    extractCodedText(codedTextPath, roundTripPath, alphabetRoot);
    return codeBinaryFromText(codedTextPath, packedPath);
 }
--- a/algorithms/CPlusPlus/Trees/huffmanCode/huffman.h
+++ b/algorithms/CPlusPlus/Trees/huffmanCode/huffman.h
@ -0,0 +1,38 @@
 #ifndef ALGORITHMS_CPLUSPLUS_TREES_HUFFMANCODE_HUFFMAN_H_
 #define ALGORITHMS_CPLUSPLUS_TREES_HUFFMANCODE_HUFFMAN_H_
 #include <string>
 using namespace std;
 // One node of the Huffman code tree.
 class Node{
 public:
    Node * parent;  // link towards the root; nullptr when not set
    Node * left;    // first child, or nullptr
    Node * right;   // second child, or nullptr
    char key;       // character stored in the node
    int freq;       // occurrence count associated with the node
    string name;    // label of the node; left empty by the constructor
    // Stores the three links, the key (converted from int to char) and the
    // frequency exactly as given.
    Node(Node * p, Node * l, Node * r, int k, int f)
        : parent(p), left(l), right(r), key(k), freq(f)
    {
    }
 };
 // Writes a compacted copy of filePath to modifiedSourceFilePath: runs of two
 // or more whitespace characters become a single space and the lines are
 // joined into one.
 void minifyFile(string filePath, string modifiedSourceFilePath);
 // Builds the code tree from the character frequencies of the given file and
 // returns its root; the same tree is used for coding and for decoding.
 Node * buildCodeTree(string alphabetSourceFilePath);
 // Encodes the file with the given tree and writes the outputs into the
 // results folder. The returned count is the number of integers stored in the
 // binary file; pass it as linesNum to decompressBinaryFile.
 int compresstoBinaryFile(string modifiedSourceFilePath, Node * alphabetRoot);
 // Reads linesNum integers from the binary file, decodes them with the given
 // tree and writes the resulting text to textDistnationFilePath.
 void decompressBinaryFile(string binarySourceFilePath, string textDistnationFilePath,int linesNum, Node * alphabetRoot);
 #endif // ALGORITHMS_CPLUSPLUS_TREES_HUFFMANCODE_HUFFMAN_H_
--- a/algorithms/CPlusPlus/Trees/huffmanCode/main.cpp
+++ b/algorithms/CPlusPlus/Trees/huffmanCode/main.cpp
@ -0,0 +1,27 @@
 #include "./huffman.h"
 int main() {
    // example
    string sourceFilePath = "results/data-sample.txt";
    string modifiedSourceFilePath = "results/modifiedSource.txt";
    string binaryResultFilePath = "results/BinaryCoded.dat"; // the compressed file
    string binaryDecodedFilePath = "results/decodedBinary.txt"; // after decompression
    // reduces file size
    minifyFile(sourceFilePath, modifiedSourceFilePath);
    // tree of ascii chars, used as the codes references
    Node * alphabetRoot = buildCodeTree(modifiedSourceFilePath);
    // linesNum tells how many binary lines to be converted
    int linesNum = compresstoBinaryFile(modifiedSourceFilePath, alphabetRoot);
    // decompresses the binary file into text file
    decompressBinaryFile(binaryResultFilePath, binaryDecodedFilePath, linesNum, alphabetRoot);
    return 0;
 }
--- a/algorithms/CPlusPlus/Trees/huffmanCode/results/data-sample.txt
+++ b/algorithms/CPlusPlus/Trees/huffmanCode/results/data-sample.txt
@ -0,0 +1,2 @@
 lorem text:
 "Sed ut perspiciatis unde omnis iste natus error sit voluptatem accusantium doloremque laudantium, totam rem aperiam, eaque ipsa quae ab illo inventore veritatis et quasi architecto beatae vitae dicta sunt explicabo. Nemo enim ipsam voluptatem quia voluptas sit aspernatur aut odit aut fugit, sed quia consequuntur magni dolores eos qui ratione voluptatem sequi nesciunt. Neque porro quisquam est, qui dolorem ipsum quia dolor sit amet, consectetur, adipisci velit, sed quia non numquam eius modi tempora incidunt ut labore et dolore magnam aliquam quaerat voluptatem. Ut enim ad minima veniam, quis nostrum exercitationem ullam corporis suscipit laboriosam, nisi ut aliquid ex ea commodi consequatur? Quis autem vel eum iure reprehenderit qui in ea voluptate velit esse quam nihil molestiae consequatur, vel illum qui dolorem eum fugiat quo voluptas nulla pariatur?"
		`@ -0,0 +1,2 @@`
							`lorem text:`
							"Sed ut perspiciatis unde omnis iste natus error sit voluptatem accusantium doloremque laudantium, totam rem aperiam, eaque ipsa quae ab illo inventore veritatis et quasi architecto beatae vitae dicta sunt explicabo. Nemo enim ipsam voluptatem quia voluptas sit aspernatur aut odit aut fugit, sed quia consequuntur magni dolores eos qui ratione voluptatem sequi nesciunt. Neque porro quisquam est, qui dolorem ipsum quia dolor sit amet, consectetur, adipisci velit, sed quia non numquam eius modi tempora incidunt ut labore et dolore magnam aliquam quaerat voluptatem. Ut enim ad minima veniam, quis nostrum exercitationem ullam corporis suscipit laboriosam, nisi ut aliquid ex ea commodi consequatur? Quis autem vel eum iure reprehenderit qui in ea voluptate velit esse quam nihil molestiae consequatur, vel illum qui dolorem eum fugiat quo voluptas nulla pariatur?"