zoukankan      html  css  js  c++  java
  • 基于哈夫曼编码完成的文件压缩及解压

    这几天在较为认真的研究基于哈夫曼编码的文件压缩及解压,费了点时间,在这分享一下:


    这里用链式结构,非顺序表结构;


    文件压缩:

    1.获取文件信息(这里采用TXT格式文本);

    2.压缩文件;

    3.写配置文件(便于解压时用,无非就是存放原文件的索引之类的,比如说,文件中某个字符出现的个数,记录下来)

    4.解压缩,使用压缩后的文件和配置文件解压文件;

    5.用比对软件,比对解压后的文件和源文件是否相同;


    下面慢慢解析:

    先看一个文件信息类:

    /// Integer type used for per-byte occurrence counts.
    typedef long long LongType;

    /// Frequency record for one byte value of the input, plus its Huffman code.
    struct FileInfo
    {
    	unsigned char _ch;       ///< byte value this record describes
    	LongType _count;         ///< number of occurrences of the byte
    	std::string _code;       ///< Huffman code, stored as a string of '0'/'1' characters

    	/// Builds a record for `ch` with a zero count and an empty code.
    	FileInfo(unsigned char ch = 0)
    		:_ch(ch)
    		,_count(0)
    	{}

    	/// Returns a record whose count is the sum of both operands' counts.
    	/// The result's byte value is 0 and its code is empty.
    	/// Const so that it can be applied to const operands and temporaries.
    	FileInfo operator+(const FileInfo& x) const
    	{
    		FileInfo tmp;
    		tmp._count = this->_count + x._count;
    		return tmp;
    	}

    	/// Two records differ when their counts differ; the byte value and the
    	/// code are not compared.
    	bool operator !=(const FileInfo& x) const
    	{
    		return this->_count != x._count;
    	}
    };

    /// Orders records by count only.
    /// Arguments are taken by reference so that a comparison does not copy the
    /// code string of either operand.
    bool operator<(const FileInfo& info1, const FileInfo& info2)
    {
    	return info1._count < info2._count;
    }
    此为一个文件信息的类结构,包含字符、该字符出现的次数,以及该字符对应的哈夫曼编码(能看到这篇博客的兄弟,对哈夫曼编码不会陌生,这里不再赘述)

    除了统计字符出现的次数及哈夫曼编码,还完成了几个运算符的重载


    要获取哈夫曼编码,就得先建立哈夫曼树;建立哈夫曼树时用最小堆来选取权值最小的节点,以下是最小堆的实现:

    /// Comparator that returns true when `l < r`.
    template<class T>
    struct Less
    {
    	bool operator() (const T& l, const T& r) const
    	{
    		return l < r; // relies on operator<
    	}

    };

    /// Comparator that returns true when `l > r`.
    template<class T>
    struct Greater
    {
    	bool operator() (const T& l, const T& r) const
    	{
    		return l > r; // relies on operator>
    	}
    };

    /// Binary heap stored in a vector, with the smallest element at index 0.
    ///
    /// NOTE(review): the ordering is decided by the expression
    /// `*_arrays[a] < _arrays[b]` (see _Before), which dereferences the left
    /// operand and does not consult `Compare`. That expression is kept exactly
    /// as it was because element types defined outside this block may depend
    /// on it -- confirm whether routing the comparison through `Compare` is the
    /// intended design.
    template<class T, class Compare = Less<T>>
    class Heap
    {
    public:
    	/// Creates an empty heap.
    	Heap()
    	{}

    	/// Builds a heap from the `size` elements starting at `a`.
    	Heap(const T* a, size_t size)
    		: _arrays(a, a + size)
    	{
    		// Sift down every node that has a child, from the last parent
    		// (index size/2 - 1) back to the root. Counting with an unsigned
    		// index that stays >= 1 avoids the wrap-around of `size - 2` when
    		// there are fewer than two elements.
    		for (size_t i = _arrays.size() / 2; i > 0; --i)
    		{
    			_SiftDown(i - 1);
    		}
    	}

    	/// Inserts `x` and restores the heap order.
    	void Push(const T& x)
    	{
    		_arrays.push_back(x);
    		_SiftUp(_arrays.size() - 1);
    	}

    	/// Removes the top element. The heap must not be empty.
    	void Pop()
    	{
    		assert(_arrays.size() > 0);
    		// Move the top to the back, drop it, then repair from the root.
    		std::swap(_arrays[0], _arrays[_arrays.size() - 1]);
    		_arrays.pop_back();

    		_SiftDown(0);
    	}

    	/// Returns the top element. The heap must not be empty.
    	T& Top()
    	{
    		assert(_arrays.size() > 0);
    		return _arrays[0];
    	}

    	/// Read-only access to the top element. The heap must not be empty.
    	const T& Top() const
    	{
    		assert(_arrays.size() > 0);
    		return _arrays[0];
    	}

    	/// True when the heap holds no elements.
    	bool Empty() const
    	{
    		return _arrays.empty();
    	}

    	/// Number of stored elements.
    	int Size() const
    	{
    		return static_cast<int>(_arrays.size());
    	}

    	/// Sifts the element at index `root` down until neither child orders
    	/// before it. A negative index is ignored.
    	void AdjustDown(int root)
    	{
    		if (root < 0)
    		{
    			return;
    		}
    		_SiftDown(static_cast<size_t>(root));
    	}

    	/// Sifts the element at index `child` up until its parent orders before
    	/// it or it reaches the root. Indices outside the heap are ignored.
    	void AdjustUp(int child)
    	{
    		if (child <= 0 || static_cast<size_t>(child) >= _arrays.size())
    		{
    			return;
    		}
    		_SiftUp(static_cast<size_t>(child));
    	}


    public:
    	vector<T> _arrays;   ///< heap storage; index 0 is the top

    private:
    	/// Ordering test shared by both sift routines; the expression is the
    	/// one this heap has always used.
    	bool _Before(size_t lhs, size_t rhs)
    	{
    		return *_arrays[lhs] < _arrays[rhs];
    	}

    	void _SiftDown(size_t parent)
    	{
    		const size_t count = _arrays.size();
    		size_t child = parent * 2 + 1;
    		while (child < count)
    		{
    			// Pick whichever child orders first.
    			if (child + 1 < count && _Before(child + 1, child))
    			{
    				++child;
    			}

    			if (!_Before(child, parent))
    			{
    				break;
    			}

    			std::swap(_arrays[child], _arrays[parent]);
    			parent = child;
    			child = parent * 2 + 1;
    		}
    	}

    	void _SiftUp(size_t child)
    	{
    		while (child > 0)
    		{
    			const size_t parent = (child - 1) / 2;
    			if (!_Before(child, parent))
    			{
    				break;
    			}

    			std::swap(_arrays[parent], _arrays[child]);
    			child = parent;
    		}
    	}
    };
    最小堆里也完成了很多接口,包括push  pop等

    然后就是几个压缩和解压的函数接口


    1.根据哈夫曼树获取哈夫曼编码:

    	/// Walks the subtree under `root` and fills in the code of every leaf.
    	///
    	/// For a leaf, the path to the top of the tree is followed through the
    	/// `_parent` links: a step taken from a left child appends '1', a step
    	/// from a right child appends '0'. The bits are collected leaf-to-root,
    	/// so the string is reversed at the end. The code is stored in
    	/// `_infos`, indexed by the leaf's byte value.
    	void _GenerateHuffmanCode(HuffmanTreeNode<FileInfo>* root)
    	{
    		if (root == nullptr)
    		{
    			return;
    		}

    		const bool hasChild = (root->_left != nullptr) || (root->_right != nullptr);
    		if (hasChild)
    		{
    			// Only leaves carry a code; keep descending, left side first.
    			_GenerateHuffmanCode(root->_left);
    			_GenerateHuffmanCode(root->_right);
    			return;
    		}

    		string& bits = _infos[root->_weight._ch]._code;
    		HuffmanTreeNode<FileInfo>* node = root;
    		for (HuffmanTreeNode<FileInfo>* up = node->_parent; up != nullptr; up = node->_parent)
    		{
    			if (up->_left == node)
    			{
    				bits += '1';
    			}
    			else if (up->_right == node)
    			{
    				bits += '0';
    			}
    			node = up;
    		}
    		reverse(bits.begin(), bits.end());
    	}


    2.根据最小堆建立哈夫曼树;

    /// Builds the Huffman tree from the `size` weights in `a`.
    ///
    /// Entries that compare equal to `invalid` (via operator!=) are skipped.
    /// Every remaining weight becomes a leaf; the two smallest nodes are then
    /// merged under a new parent, repeatedly, until one node is left, which
    /// becomes `_root`. When no entry is usable, `_root` is set to nullptr.
    /// Nodes are allocated with `new` and are not released by this function.
    /// @param a        array of weights; must not be null.
    /// @param size     number of entries in `a`.
    /// @param invalid  sentinel marking entries to ignore.
    void CreateTree(T *a, size_t size, const T& invalid)
    	{
    		assert(a);
    		// The heap stores node pointers, so its element type has to be the
    		// pointer type, not the node type.
    		Heap<HuffmanTreeNode<T>*> s1;

    		// One leaf per usable weight.
    		for (size_t i = 0; i < size; ++i)
    		{
    			if (a[i] != invalid)
    			{
    				HuffmanTreeNode<T>* node = new HuffmanTreeNode<T>(a[i]);
    				s1.Push(node);
    			}
    		}

    		// Nothing to build from (e.g. an empty input file): leave an empty
    		// tree instead of reading the top of an empty heap.
    		if (s1.Empty())
    		{
    			_root = nullptr;
    			return;
    		}

    		// Merge the two smallest nodes until a single root remains.
    		while (s1.Size() > 1)
    		{
    			HuffmanTreeNode<T>* left = s1.Top();
    			s1.Pop();
    			HuffmanTreeNode<T>* right = s1.Top();
    			s1.Pop();

    			HuffmanTreeNode<T>* parent = new HuffmanTreeNode<T>(left->_weight + right->_weight);

    			parent->_left = left;
    			parent->_right = right;

    			left->_parent = parent;
    			right->_parent = parent;

    			s1.Push(parent);
    		}
    		_root = s1.Top();
    		s1.Pop();
    	}


    3.读取文本文件中的一行:

    	/// Reads one record from the config file and appends it to `line`.
    	///
    	/// A record normally ends at the next '\n', which is consumed but not
    	/// stored. If the very first character read is '\n', it is taken to be
    	/// the recorded byte itself (the record for the newline character), so
    	/// it is stored and reading continues to the following '\n'.
    	/// @param fOutLogFile  stream to read from.
    	/// @param line         receives the record; existing content is kept
    	///                     and appended to.
    	/// @return false when the stream is already at end of file; true
    	///         otherwise, including for a last record that is not
    	///         terminated by '\n'.
    	bool _ReadLine(FILE *fOutLogFile, std::string& line)
    	{
    		// fgetc returns int so that EOF can be told apart from byte 0xFF.
    		int ch = fgetc(fOutLogFile);
    		if (ch == EOF)
    		{
    			return false;
    		}

    		if (ch == '\n')
    		{
    			line += static_cast<char>(ch);
    			ch = fgetc(fOutLogFile);
    		}

    		// Stop at end of file as well, otherwise a truncated file would
    		// keep this loop running forever.
    		while (ch != EOF && ch != '\n')
    		{
    			line += static_cast<char>(ch);
    			ch = fgetc(fOutLogFile);
    		}
    		return true;
    	}

    4.文件压缩

    	/// Compresses `filename` into `<filename>.huf` with Huffman coding.
    	///
    	/// Pass 1 counts how often each byte value occurs (into `_infos`). A
    	/// Huffman tree is built from those counts and a code is generated for
    	/// every byte that occurs. Pass 2 re-reads the input and writes the
    	/// codes bit by bit, most significant bit first; a final partial byte
    	/// is padded with zero bits on the right.
    	///
    	/// NOTE(review): decompression also needs the frequency table, which
    	/// this function body does not write -- confirm it is emitted elsewhere.
    	/// @param filename  path of the file to compress; must not be null.
    	/// @return true on success, false when either file cannot be opened.
    	bool Compress(const char* filename)
    	{
    		assert(filename);
    		FILE *fIn = fopen(filename, "rb");
    		if (fIn == nullptr)
    		{
    			return false;
    		}

    		// Pass 1: frequency count. The fgetc result is kept as int so EOF is
    		// distinguishable from byte 0xFF, and the loop also ends on a read
    		// error (which never sets the end-of-file flag).
    		bool sawByte = false;
    		for (int c = fgetc(fIn); c != EOF; c = fgetc(fIn))
    		{
    			_infos[static_cast<unsigned char>(c)]._count++;
    			sawByte = true;
    		}

    		// An empty input has nothing to encode, so no tree is needed.
    		if (sawByte)
    		{
    			HuffmanTree<FileInfo> ht;
    			FileInfo invalid;
    			ht.CreateTree(_infos, 256, invalid);
    			_GenerateHuffmanCode(ht.GetRoot());
    		}

    		// Output name is the input name with ".huf" appended.
    		std::string compressFile = filename;
    		compressFile += ".huf";

    		FILE *fCompressed = fopen(compressFile.c_str(), "wb");
    		if (fCompressed == nullptr)
    		{
    			fclose(fIn);
    			return false;
    		}

    		// Pass 2: emit the code of every input byte.
    		fseek(fIn, 0, SEEK_SET);
    		unsigned char packed = 0;   // bits collected for the next output byte
    		int usedBits = 0;           // how many bits of `packed` are filled
    		for (int c = fgetc(fIn); c != EOF; c = fgetc(fIn))
    		{
    			const std::string& code = _infos[static_cast<unsigned char>(c)]._code;

    			for (size_t i = 0; i < code.size(); ++i)
    			{
    				packed <<= 1;
    				if (code[i] == '1')
    				{
    					packed |= 1;
    				}

    				if (++usedBits == 8)
    				{
    					fputc(packed, fCompressed);
    					usedBits = 0;
    					packed = 0;
    				}
    			}
    		}

    		fclose(fIn);

    		// Flush a trailing partial byte: shift the collected bits up to the
    		// high end, the low bits are padding.
    		if (usedBits != 0)
    		{
    			packed <<= (8 - usedBits);
    			fputc(packed, fCompressed);
    		}

    		fclose(fCompressed);
    		return true;
    	}

    5.写配置文件:

    string logFile = filename;
    		logFile += ".log";
    		
    		FILE *Log = fopen(logFile.c_str(), "wb");
    		assert(Log);
    
    		string chInfo;
    
    		char str[128] = {0}; //没空间 不可以
    
    		for (size_t i = 1; i < 256; ++i)
    		{
    			if (_infos[i]._count > 0)
    			{
    				chInfo += _infos[i]._ch;
    				chInfo += ',';
    				chInfo += _itoa(_infos[i]._count,str,10);
    				chInfo += '
    ';
    				fputs(chInfo.c_str(), Log);
    				chInfo.clear();
    			}
    		}
    
    		fclose(Log);

    6.最后的文件解压:

    //重构文件
    	/// Decodes `<Fileneme>.huf` into `<Fileneme>.over` using the Huffman tree.
    	///
    	/// Bits are consumed most significant first. Starting from `root`, a 1
    	/// bit moves to the left child and a 0 bit to the right child; when a
    	/// leaf is reached its byte is written and decoding restarts at `root`.
    	/// Decoding stops after `size` bytes, so padding bits in the last
    	/// compressed byte are never interpreted. It also stops early if the
    	/// compressed stream ends or leads to a missing child.
    	/// @param root      root of the Huffman tree; must not be null.
    	/// @param Fileneme  base path; ".huf" and ".over" are appended to it.
    	/// @param size      number of bytes to produce.
    	void _RestoreFiles(HuffmanTreeNode<FileInfo> *root, const char* Fileneme,long long size)
    	{
    		assert(root);

    		// Compressed input.
    		std::string name = Fileneme;
    		name += ".huf";

    		FILE* Out = fopen(name.c_str(),"rb");
    		assert(Out);
    		if (Out == nullptr)
    		{
    			return;
    		}

    		// Restored output.
    		std::string restorefilename = Fileneme;
    		restorefilename += ".over";
    		FILE *over = fopen(restorefilename.c_str(),"wb");
    		assert(over);
    		if (over == nullptr)
    		{
    			fclose(Out);
    			return;
    		}

    		int current = 0;      // byte being decoded; int so EOF is detectable
    		int bitsLeft = 0;     // unread bits remaining in `current`
    		bool intact = true;   // cleared when the stream ends or is inconsistent

    		for (long long remaining = size; intact && remaining > 0; --remaining)
    		{
    			HuffmanTreeNode<FileInfo>* cur = root;
    			while (cur->_left != nullptr || cur->_right != nullptr)
    			{
    				if (bitsLeft == 0)
    				{
    					current = fgetc(Out);
    					if (current == EOF)
    					{
    						intact = false;
    						break;
    					}
    					bitsLeft = 8;
    				}

    				--bitsLeft;
    				const int bit = (current >> bitsLeft) & 1;
    				cur = (bit == 1) ? cur->_left : cur->_right;
    				if (cur == nullptr)
    				{
    					intact = false;
    					break;
    				}
    			}

    			if (intact)
    			{
    				fputc(cur->_weight._ch, over);
    			}
    		}

    		fclose(Out);
    		fclose(over);
    	}
    
    	
    	void UnCompress(const char* Fileneme)//解压缩
    	{
    		//1.打开日志文件
    		//2.根据信息还原哈夫曼树
    		//3.还原信息;
    		string UnCompressneme = Fileneme;
    		UnCompressneme += ".log";
    		FILE *fOutLogFile = fopen(UnCompressneme.c_str(), "rb");
    		assert(fOutLogFile);
    
    		string line;
    		while (_ReadLine(fOutLogFile, line))
    		{
    			unsigned char ch = line[0];
    			_infos[ch]._count = atoi(line.substr(2).c_str());
    			line.clear();
    		} 
    
    		HuffmanTree<FileInfo> f;
    		FileInfo invalid;
    		f.CreateTree(_infos, 256, invalid);
    
    		//根据重建的哈夫曼树 还原文件;
    		long long size = f.GetRoot()->_weight._count;
    		_RestoreFiles(f.GetRoot(), Fileneme,size);
    	}
    到此,此项目基本完成;如遇问题,希望留言,随时解答,如有见解,跪求赐教!






  • 相关阅读:
    AppDomain and related
    实现 Finalize 和 Dispose 以清理非托管资源
    递归显示treeview,求更好方法
    SQL练习题之子查询
    jquery in action 学习笔记
    daily english 201117
    TOP AND APPLY
    Create trace with tsql
    (转)sqlserver 锁查看
    一个简单的windows services demo(c#)
  • 原文地址:https://www.cnblogs.com/melons/p/5791864.html
Copyright © 2011-2022 走看看