这几天比较认真地研究了基于哈夫曼编码的文件压缩及解压,花了点时间,在这里分享一下:
这里用链式结构,非顺序表结构;
文件压缩:
1.获取文件信息(这里采用TXT格式文本);
2.压缩文件;
3.写配置文件(便于解压时用,无非就是存放原文件的索引之类的,比如说,文件中某个字符出现的个数,记录下来)
4.解压缩,使用压缩后的文件和配置文件解压文件;
5.用比对软件,比对解压后的文件和源文件是否相同;
下面慢慢解析:
先看一个文件信息类:
using LongType = long long;

// Per-character statistics used to build the Huffman tree.
struct FileInfo
{
    unsigned char _ch;   // the character (byte value)
    LongType _count;     // number of times the character occurs
    std::string _code;   // Huffman code assigned to the character

    FileInfo(unsigned char ch = 0)
        : _ch(ch)
        , _count(0)
    {}

    // Combines two records into the weight of a parent node: only the counts
    // are summed; _ch and _code of the result keep their default values.
    // Declared const so it can be applied to const operands.
    FileInfo operator+(const FileInfo& x) const
    {
        FileInfo tmp;
        tmp._count = _count + x._count;
        return tmp;
    }

    // Two records differ when their counts differ (the character is ignored).
    bool operator!=(const FileInfo& x) const
    {
        return _count != x._count;
    }
};

// Orders records by occurrence count. Takes const references so that no
// std::string copy is made for each comparison.
bool operator<(const FileInfo& info1, const FileInfo& info2)
{
    return info1._count < info2._count;
}
// This is the file-information structure: it holds a character, how many
// times that character occurs, and the character's Huffman code (readers of
// this post are assumed to know Huffman coding already, so it is not
// explained again here).
除了统计字符出现的次数及哈夫曼编码,还完成了几个运算符的重载
要获取哈夫曼编码,就得先建立哈夫曼树;建立哈夫曼树时用最小堆来操作,以下是最小堆的建立过程
// 小堆 template<class T> struct Less { bool operator() (const T& l, const T& r) { return l < r; // operator< } }; template<class T> struct Greater { bool operator() (const T& l, const T& r) { return l > r; // operator< } }; template<class T, class Compare = Less<T>> class Heap { public: Heap() {} Heap(const T* a, size_t size) { for (size_t i = 0; i < size; ++i) { _arrays.push_back(a[i]); } // 建堆 for(int i = (_arrays.size()-2)/2; i >= 0; --i) { AdjustDown(i); } } void Push(const T& x) { _arrays.push_back(x); AdjustUp(_arrays.size()-1); } void Pop() { assert(_arrays.size() > 0); swap(_arrays[0], _arrays[_arrays.size() - 1]); _arrays.pop_back(); AdjustDown(0); } T& Top() { assert(_arrays.size() > 0); return _arrays[0]; } bool Empty() { return _arrays.empty(); } int Size() { return _arrays.size(); } void AdjustDown(int root) { int child = root*2 + 1; // Compare com; while (child < _arrays.size()) { // 比较出左右孩子中小的那个 if (child+1<_arrays.size() && *_arrays[child+1] < _arrays[child]) //if(child+1<_arrays.size() && // com(_arrays[child+1],_arrays[child])) { ++child; } if(*_arrays[child] < _arrays[root]) //if(com(_arrays[child],_arrays[root])) { swap(_arrays[child], _arrays[root]); root = child; child = 2*root+1; } else { break; } } } void AdjustUp(int child) { int parent = (child-1)/2; //while (parent >= 0) while (child > 0) { if (*_arrays[child] < _arrays[parent]) { swap(_arrays[parent], _arrays[child]); child = parent; parent = (child-1)/2; } else { break; } } } public: vector<T> _arrays; };最小堆里也完成了很多接口,包括push pop等
然后就是几个压缩和解压的函数接口
1.根据哈夫曼树获取哈夫曼编码:
// Walks the tree below `root` and, for every leaf, appends that leaf's
// Huffman code to the _code field of the matching _infos entry.
// A left edge contributes '1' and a right edge contributes '0'.
void _GenerateHuffmanCode(HuffmanTreeNode<FileInfo>* root)
{
    if (!root)
    {
        return;
    }

    const bool isLeaf = (root->_left == nullptr) && (root->_right == nullptr);
    if (!isLeaf)
    {
        // Only leaves carry characters; internal nodes just forward the walk.
        _GenerateHuffmanCode(root->_left);
        _GenerateHuffmanCode(root->_right);
        return;
    }

    std::string& bits = _infos[root->_weight._ch]._code;

    // Climb from the leaf to the root, recording one bit per edge.
    for (HuffmanTreeNode<FileInfo>* node = root; node->_parent; node = node->_parent)
    {
        HuffmanTreeNode<FileInfo>* up = node->_parent;
        if (up->_left == node)
        {
            bits += '1';
        }
        else if (up->_right == node)
        {
            bits += '0';
        }
    }

    // Bits were collected leaf-to-root; the code reads root-to-leaf.
    std::reverse(bits.begin(), bits.end());
}
// Builds the Huffman tree from the `size` weights in `a`, skipping every
// entry that compares equal to `invalid`, and stores the result in _root.
// If no entry is valid, _root is set to nullptr instead of touching an
// empty heap.
void CreateTree(T *a, size_t size, const T& invalid)
{
    assert(a);

    // The heap holds node pointers, so its element type must be the pointer
    // type (not the node type).
    Heap<HuffmanTreeNode<T>*> s1;
    for (size_t i = 0; i < size; ++i)
    {
        if (a[i] != invalid)
        {
            HuffmanTreeNode<T>* node = new HuffmanTreeNode<T>(a[i]);
            s1.Push(node);
        }
    }

    // Nothing to build (e.g. every weight was invalid).
    if (s1.Empty())
    {
        _root = nullptr;
        return;
    }

    // Repeatedly merge the two smallest nodes under a new parent.
    while (s1.Size() > 1)
    {
        HuffmanTreeNode<T>* left = s1.Top();
        s1.Pop();
        HuffmanTreeNode<T>* right = s1.Top();
        s1.Pop();

        HuffmanTreeNode<T>* parent = new HuffmanTreeNode<T>(left->_weight + right->_weight);
        parent->_left = left;
        parent->_right = right;
        left->_parent = parent;
        right->_parent = parent;
        s1.Push(parent);
    }

    _root = s1.Top();
    s1.Pop();
}
// Reads one record from the configuration file and appends it to `line`.
//
// Records are terminated by a single ' ' byte. Because the first byte of a
// record is the character being described, a record may itself begin with
// ' '; that leading byte is taken as data rather than as a terminator.
//
// Returns false when the stream is already at end-of-file (nothing is
// appended), true otherwise. A final record that is not followed by the
// terminator is still returned: reading stops at end-of-file instead of
// looping forever.
bool _ReadLine(FILE *fOutLogFile, std::string& line)
{
    // Keep the result as int so EOF can be told apart from the byte 0xFF.
    int ch = fgetc(fOutLogFile);
    if (ch == EOF)
    {
        return false;
    }

    if (ch == ' ')
    {
        line += static_cast<char>(ch);
        ch = fgetc(fOutLogFile);
    }

    while (ch != EOF && ch != ' ')
    {
        line += static_cast<char>(ch);
        ch = fgetc(fOutLogFile);
    }
    return true;
}
4.文件压缩
//文件压缩 bool Compress(const char* filename) { //1.打开一个文件,统计文件字符出现的次数 //2.生成对应的哈弗曼编码 //3.压缩文件 //4.写配置文件,方便解压缩 assert(filename); FILE *fOut = fopen(filename, "rb"); assert(fOut); //统计文件字符出现的次数 unsigned char ch = fgetc(fOut); while (!feof(fOut)) //文件结束 { _infos[ch]._count++; ch = fgetc(fOut); } HuffmanTree<FileInfo> ht; FileInfo invalid; ht.CreateTree(_infos, 256, invalid); //哈夫曼编码 _GenerateHuffmanCode(ht.GetRoot()); string compressFile = filename; compressFile += ".huf"; //压缩后的文件名 后缀为《输入文件名+.huf》 FILE *finCompress = fopen(compressFile.c_str(), "wb"); //获取string中的C字符串 assert(finCompress); fseek(fOut, 0, SEEK_SET);//将文件指针移到开头 char cha = fgetc(fOut); unsigned char inch = 0; int index = 0; //一个字节的八位 while (!feof(fOut)) { string& code = _infos[(unsigned char)cha]._code; for (size_t i = 0; i < code.size(); ++i) { inch <<= 1; //低位向高位进 if (code[i] == '1') { inch |= 1; } if (++index == 8) { fputc(inch, finCompress); //够8位,装进文件 index = 0; //重新一轮开始 inch = 0; } } cha = fgetc(fOut); } fclose(fOut); //如果index = 0 说明 上边8位刚好存满 不等 下一个自己又出来了 if (index != 0) //处理最后一个字符不够的问题 { inch <<= (8 - index); //最高位必须装上 后边的浪费掉 fputc(inch, finCompress); } fclose(finCompress); }
5.写配置文件:
string logFile = filename; logFile += ".log"; FILE *Log = fopen(logFile.c_str(), "wb"); assert(Log); string chInfo; char str[128] = {0}; //没空间 不可以 for (size_t i = 1; i < 256; ++i) { if (_infos[i]._count > 0) { chInfo += _infos[i]._ch; chInfo += ','; chInfo += _itoa(_infos[i]._count,str,10); chInfo += ' '; fputs(chInfo.c_str(), Log); chInfo.clear(); } } fclose(Log);
6.最后的文件解压:
//重构文件 void _RestoreFiles(HuffmanTreeNode<FileInfo> *root, const char* Fileneme,long long size) { assert(root); //原压缩文件 string name = Fileneme; name += ".huf"; FILE* Out = fopen(name.c_str(),"rb"); assert(Out); string restorefilename = Fileneme; restorefilename += ".over"; FILE *over = fopen(restorefilename.c_str(),"wb"); assert(over); int pos = 8; long long poss = size; unsigned char chz = fgetc(Out); while (poss>0) { HuffmanTreeNode<FileInfo>* cur = nullptr; cur = root; while (cur->_left != nullptr || cur->_right != nullptr) { pos--; unsigned char temp = chz >> pos; int ch = 1 & temp; if (ch == 0) { cur = cur->_right; } else if (ch == 1) { cur = cur->_left; } if (pos == 0) { chz = fgetc(Out); pos = 8; } } fputc(cur->_weight._ch, over); poss--; } fclose(Out); fclose(over); } void UnCompress(const char* Fileneme)//解压缩 { //1.打开日志文件 //2.根据信息还原哈夫曼树 //3.还原信息; string UnCompressneme = Fileneme; UnCompressneme += ".log"; FILE *fOutLogFile = fopen(UnCompressneme.c_str(), "rb"); assert(fOutLogFile); string line; while (_ReadLine(fOutLogFile, line)) { unsigned char ch = line[0]; _infos[ch]._count = atoi(line.substr(2).c_str()); line.clear(); } HuffmanTree<FileInfo> f; FileInfo invalid; f.CreateTree(_infos, 256, invalid); //根据重建的哈夫曼树 还原文件; long long size = f.GetRoot()->_weight._count; _RestoreFiles(f.GetRoot(), Fileneme,size); }到此,此项目基本完成;如遇问题,希望留言,随时解答,如有见解,跪求赐教!