From b330bdb75dc933bc9bb9bb042a6ebcf333c74149 Mon Sep 17 00:00:00 2001 From: Kareim Tarek AbdelAzeem <49312818+KareimGazer@users.noreply.github.com> Date: Mon, 20 Sep 2021 13:45:11 +0200 Subject: [PATCH] chore(CPlusPlus): added Huffman (#468) Signed-off-by: Kareim Tarek AbdelAzeem Co-authored-by: Ankur <54987647+Arsenic-ATG@users.noreply.github.com> --- .../CPlusPlus/Trees/huffmanCode/README.md | 23 ++ .../CPlusPlus/Trees/huffmanCode/huffman.cpp | 284 ++++++++++++++++++ .../CPlusPlus/Trees/huffmanCode/huffman.h | 38 +++ .../CPlusPlus/Trees/huffmanCode/main.cpp | 27 ++ .../Trees/huffmanCode/results/data-sample.txt | 2 + 5 files changed, 374 insertions(+) create mode 100644 algorithms/CPlusPlus/Trees/huffmanCode/README.md create mode 100644 algorithms/CPlusPlus/Trees/huffmanCode/huffman.cpp create mode 100644 algorithms/CPlusPlus/Trees/huffmanCode/huffman.h create mode 100644 algorithms/CPlusPlus/Trees/huffmanCode/main.cpp create mode 100644 algorithms/CPlusPlus/Trees/huffmanCode/results/data-sample.txt diff --git a/algorithms/CPlusPlus/Trees/huffmanCode/README.md b/algorithms/CPlusPlus/Trees/huffmanCode/README.md new file mode 100644 index 00000000..30276e55 --- /dev/null +++ b/algorithms/CPlusPlus/Trees/huffmanCode/README.md @@ -0,0 +1,23 @@ +# Huffman Code + +## Goal + +The goal is to minimize the size of files using the fact that some characters are redundant and occur with a higher frequency than others (like the word "the"), so the goal is to use fewer chars to type the most common words and more chars for the least common words. + +## Get Started + +In the `main.cpp` file you can find a running example, and the results can be seen in detail, phase by phase, in the `results` folder. + +## The trick + +The problem with Huffman codes is that they use the concept of **make the common case faster** regarding the frequency of words in the file, which is a variable of the file itself. 
+So for better performance the program records the frequency of each word in the file before processing, providing the algorithm with the best prediction. + +## Performance + +The algorithm here decreases file size by half on average. +Personally, I used it in this program to minimize my XML and JSON files. + +## Application + +This algorithm is mainly used to decrease file sizes, for anything from text to audio. diff --git a/algorithms/CPlusPlus/Trees/huffmanCode/huffman.cpp b/algorithms/CPlusPlus/Trees/huffmanCode/huffman.cpp new file mode 100644 index 00000000..0006231a --- /dev/null +++ b/algorithms/CPlusPlus/Trees/huffmanCode/huffman.cpp @@ -0,0 +1,284 @@ +// this will require one to change the source directory in the build system +#include "algorithms/CPlusPlus/Trees/huffmanCode/huffman.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include // NOLINT (build/c++11) + +using namespace std; + +map frequency; +class Node{ +public: + Node * parent; + Node * left; + Node * right; + char key; + int freq; + string name; + Node(Node * p, Node * l, Node * r, int k, int f) + { + parent = p; + left = l; + right = r; + key = k; + freq = f; + } +}; + +void postOrder(Node * root) +{ + if(root->left != nullptr) postOrder(root->left); + //cout << root->key << " "; + if(root->right != nullptr) postOrder(root->right); + if(root->left == nullptr && root->right == nullptr) cout << root->key << " "; +} + +bool cmp(pair& a, + pair& b) +{ + return a.second < b.second; +} + +bool compare(Node * a, Node * b) +{ + return a->freq < b->freq; +} + +// Function to sort the map according +// to value in a (key-value) pairs +vector > sortChars(map& M) +{ + vector > result; + for (auto& it : M) { + result.push_back(it); + } + sort(result.begin(), result.end(), cmp); + return result; +} + +void updateMap(string textLine) +{ + for(char c : textLine) + { + frequency[c] +=1; + } +} + +vector > getAlphabet(string alphabetSource) +{ + string 
text; + ifstream MyReadFile(alphabetSource); + while (getline (MyReadFile, text)) { + //frequency['\n'] +=1; + updateMap(text); + } + MyReadFile.close(); + vector > results = sortChars(frequency); + return results; +} + +Node * getBinary(Node * root, char key, string & binaryCode) +{ + if(root->key==key) return root; + else + { + if(root->left->name.find(key) != string::npos) + { + //cout << "0"; + binaryCode += "0"; + return getBinary(root->left, key, binaryCode); + } + else + { + //cout << "1"; + binaryCode += "1"; + return getBinary(root->right, key, binaryCode); + } + } +} + +string extract(Node * root, string text) +{ + Node * head = root; + string result = ""; + for(int i=0; ileft; + else head = head->right; + + if(head->left== nullptr && head->right== nullptr) + { + result.push_back(head->key); + head = root; + } + } + return result; +} + +string code(Node * root, string text) +{ + string result = ""; + + for(int i=0; i> results = getAlphabet(alphabetSourceFilePath); + vector nodes; + for(auto result : results) + { + Node * node = new Node(nullptr, nullptr, nullptr, result.first, result.second); + node->name = node->key; + nodes.push_back(node); + } + + while(nodes.size()>1) + { + Node * right = nodes[0]; + Node * left = nodes[1]; + nodes.erase(nodes.begin()); + nodes.erase(nodes.begin()); + Node * parent = new Node(nullptr, left, right, '\0', left->freq+right->freq); + parent->name += left->name + right->name; + nodes.push_back(parent); + sort(nodes.begin(), nodes.end(), compare); + } + return nodes[0]; +} + +void writeCodedText(string modifiedSourceFilePath, string textCodeFilePath, Node * root) +{ + string sourceFileLine; + ifstream sourceFile(modifiedSourceFilePath); + ofstream distenationFile(textCodeFilePath); + + while (getline (sourceFile, sourceFileLine)) { + // Output the text from the file + string textLine = sourceFileLine ; // not adding "\n" leaves only white spaces and results in one string + string codedText = code(root, textLine); + 
distenationFile << codedText; + } + sourceFile.close(); + distenationFile.close(); +} + +void extractCodedText(string textCodeFilePath, string decodedFilePath, Node * root) +{ + string textCodeFileLine; + ifstream textCodeFile(textCodeFilePath); + ofstream decodedFile(decodedFilePath); + + while (getline (textCodeFile, textCodeFileLine)) { + string decodedText = extract(root, textCodeFileLine); + decodedFile << decodedText; // this i think mostly appends no new lines + } + textCodeFile.close(); + decodedFile.close(); +} + +int codeBinaryFromText(string textCodeFilePath, string binaryResultFilePath) +{ + string sourceTextCodedFileLine; + ifstream sourceTextCodedFile(textCodeFilePath); + int linesNum = 0; + while (getline(sourceTextCodedFile, sourceTextCodedFileLine)) { + linesNum = codedBinay(sourceTextCodedFileLine, binaryResultFilePath); + } + sourceTextCodedFile.close(); + return linesNum; +} + +void decompressBinaryFile(string binarySourceFilePath, string textDistnationFilePath,int linesNum, Node * alphabetRoot) +{ + ifstream binaryResultFile(binarySourceFilePath, ios::out | ios::binary); + ofstream decodedbinaryFile(textDistnationFilePath); + int codedNumber; + string allText = ""; + for(int i=0; i(codedNumber).to_string(); + allText += str; + } + string decodedText = extract(alphabetRoot, allText); + decodedbinaryFile << decodedText; // this i think mostly appends no new lines + binaryResultFile.close(); + decodedbinaryFile.close(); +} + +int compresstoBinaryFile(string modifiedSourceFilePath, Node * alphabetRoot) +{ + string textCodeFilePath = "results/textCoded.txt"; + string decodedTextFilePath = "results/decodedText.txt"; + string binaryResultFilePath = "results/BinaryCoded.dat"; + + writeCodedText(modifiedSourceFilePath, textCodeFilePath, alphabetRoot); + extractCodedText(textCodeFilePath, decodedTextFilePath, alphabetRoot); + int linesNum = codeBinaryFromText(textCodeFilePath, binaryResultFilePath); + return linesNum; +} diff --git 
a/algorithms/CPlusPlus/Trees/huffmanCode/huffman.h b/algorithms/CPlusPlus/Trees/huffmanCode/huffman.h new file mode 100644 index 00000000..efbba6dc --- /dev/null +++ b/algorithms/CPlusPlus/Trees/huffmanCode/huffman.h @@ -0,0 +1,38 @@ +#ifndef ALGORITHMS_CPLUSPLUS_TREES_HUFFMANCODE_HUFFMAN_H_ +#define ALGORITHMS_CPLUSPLUS_TREES_HUFFMANCODE_HUFFMAN_H_ + +#include + +using namespace std; + +class Node{ +public: + Node * parent; + Node * left; + Node * right; + char key; + int freq; + string name; + Node(Node * p, Node * l, Node * r, int k, int f) + { + parent = p; + left = l; + right = r; + key = k; + freq = f; + } +}; + +// reduces file size +void minifyFile(string filePath, string modifiedSourceFilePath); + +// tree of ascii chars, used as the codes references +Node * buildCodeTree(string alphabetSourceFilePath); + +// linesNum tells how many binary lines to be converted +int compresstoBinaryFile(string modifiedSourceFilePath, Node * alphabetRoot); + +// decompresses the binary file into text file +void decompressBinaryFile(string binarySourceFilePath, string textDistnationFilePath,int linesNum, Node * alphabetRoot); + +#endif // ALGORITHMS_CPLUSPLUS_TREES_HUFFMANCODE_HUFFMAN_H_ diff --git a/algorithms/CPlusPlus/Trees/huffmanCode/main.cpp b/algorithms/CPlusPlus/Trees/huffmanCode/main.cpp new file mode 100644 index 00000000..671e7cd6 --- /dev/null +++ b/algorithms/CPlusPlus/Trees/huffmanCode/main.cpp @@ -0,0 +1,27 @@ +#include "./huffman.h" + + +int main() { + + // example + string sourceFilePath = "results/data-sample.txt"; + + string modifiedSourceFilePath = "results/modifiedSource.txt"; + + string binaryResultFilePath = "results/BinaryCoded.dat"; // the compressed file + string binaryDecodedFilePath = "results/decodedBinary.txt"; // after decompression + + // reduces file size + minifyFile(sourceFilePath, modifiedSourceFilePath); + + // tree of ascii chars, used as the codes references + Node * alphabetRoot = buildCodeTree(modifiedSourceFilePath); + + // linesNum 
tells how many binary lines to be converted + int linesNum = compresstoBinaryFile(modifiedSourceFilePath, alphabetRoot); + + // decompresses the binary file into text file + decompressBinaryFile(binaryResultFilePath, binaryDecodedFilePath, linesNum, alphabetRoot); + + return 0; +} diff --git a/algorithms/CPlusPlus/Trees/huffmanCode/results/data-sample.txt b/algorithms/CPlusPlus/Trees/huffmanCode/results/data-sample.txt new file mode 100644 index 00000000..4182dcc5 --- /dev/null +++ b/algorithms/CPlusPlus/Trees/huffmanCode/results/data-sample.txt @@ -0,0 +1,2 @@ +lorem text: +"Sed ut perspiciatis unde omnis iste natus error sit voluptatem accusantium doloremque laudantium, totam rem aperiam, eaque ipsa quae ab illo inventore veritatis et quasi architecto beatae vitae dicta sunt explicabo. Nemo enim ipsam voluptatem quia voluptas sit aspernatur aut odit aut fugit, sed quia consequuntur magni dolores eos qui ratione voluptatem sequi nesciunt. Neque porro quisquam est, qui dolorem ipsum quia dolor sit amet, consectetur, adipisci velit, sed quia non numquam eius modi tempora incidunt ut labore et dolore magnam aliquam quaerat voluptatem. Ut enim ad minima veniam, quis nostrum exercitationem ullam corporis suscipit laboriosam, nisi ut aliquid ex ea commodi consequatur? Quis autem vel eum iure reprehenderit qui in ea voluptate velit esse quam nihil molestiae consequatur, vel illum qui dolorem eum fugiat quo voluptas nulla pariatur?" \ No newline at end of file