zoukankan      html  css  js  c++  java
  • LZ77.py

    import math
    from bitarray import bitarray
    
    class LZ77Compressor:
        """
        A simplified implementation of the LZ77 Compression Algorithm.

        The compressed stream is a sequence of bit-packed tokens:

        * ``0`` flag bit followed by 8 bits: one literal byte.
        * ``1`` flag bit followed by a 12 bit distance (how far back from the
          current position the match starts) and a 4 bit match length.

        The code is written so that it runs unchanged on Python 2 and Python 3.
        """
        # Upper bound applied to the window size requested in __init__.
        MAX_WINDOW_SIZE = 400

        def __init__(self, window_size=20):
            """
            Create a compressor.

            window_size -- number of bytes before the current position that are
                searched for matches; capped at MAX_WINDOW_SIZE.
            """
            self.window_size = min(window_size, self.MAX_WINDOW_SIZE)
            self.lookahead_buffer_size = 15  # length of match is at most 4 bits

        def compress(self, input_file_path, output_file_path=None, verbose=False):
            """
            Given the path of an input file, its content is compressed by applying
            a simple LZ77 compression algorithm.

            The compressed format is:
            0 bit followed by 8 bits (1 byte character) when there are no previous
                matches within window
            1 bit followed by 12 bits pointer (distance to the start of the match
                from the current position) and 4 bits (length of the match)

            If a path to the output file is provided, the compressed data is
            written into a binary file and None is returned. Otherwise, the
            compressed data is returned as a bitarray.

            If verbose is enabled, the compression description is printed to
            standard output.

            Raises IOError (after printing a message) if the input file cannot be
            read or the output file cannot be written.
            """
            import sys  # local import: only used for the verbose token trace

            output_buffer = bitarray(endian='big')

            # read the input file
            try:
                with open(input_file_path, 'rb') as input_file:
                    # A bytearray indexes to ints and slices to bytearrays on both
                    # Python 2 and Python 3, which keeps the loop below portable.
                    data = bytearray(input_file.read())
            except IOError:
                print('Could not open input file ...')
                raise

            i = 0
            while i < len(data):
                match = self.findLongestMatch(data, i)

                if match:
                    # Add 1 bit flag, followed by 12 bit for distance, and 4 bit
                    # for the length of the match
                    (bestMatchDistance, bestMatchLength) = match

                    output_buffer.append(True)
                    # High 8 bits of the distance, then its low 4 bits packed
                    # together with the 4 bit length.
                    output_buffer.frombytes(bytes(bytearray([
                        bestMatchDistance >> 4,
                        ((bestMatchDistance & 0xf) << 4) | bestMatchLength,
                    ])))

                    if verbose:
                        sys.stdout.write(
                            "<1, %i, %i> " % (bestMatchDistance, bestMatchLength))

                    i += bestMatchLength

                else:
                    # No useful match was found. Add 0 bit flag, followed by 8 bit
                    # for the character
                    output_buffer.append(False)
                    output_buffer.frombytes(bytes(data[i:i + 1]))

                    if verbose:
                        sys.stdout.write("<0, %s> " % chr(data[i]))

                    i += 1

            # fill the buffer with zeros if the number of bits is not a multiple of 8
            output_buffer.fill()

            # write the compressed data into a binary file if a path is provided
            if output_file_path:
                try:
                    with open(output_file_path, 'wb') as output_file:
                        output_file.write(output_buffer.tobytes())
                        print("File was compressed successfully and saved to output path ...")
                        return None
                except IOError:
                    print('Could not write to output file path. Please check if the path is correct ...')
                    raise

            # an output file path was not provided, return the compressed data
            return output_buffer

        def decompress(self, input_file_path, output_file_path=None):
            """
            Given a string of the compressed file path, the data is decompressed
            back to its original form, and written into the output file path if
            provided (None is returned in that case). If no output file path is
            provided, the decompressed data is returned as a byte string (``str``
            on Python 2, ``bytes`` on Python 3).

            Raises IOError (after printing a message) if the input file cannot be
            read or the output file cannot be written.
            """
            data = bitarray(endian='big')
            output_buffer = []  # one single-byte chunk per decoded byte

            # read the input file
            try:
                with open(input_file_path, 'rb') as input_file:
                    data.fromfile(input_file)
            except IOError:
                print('Could not open input file ...')
                raise

            # Walk the bit stream with a cursor instead of deleting consumed bits
            # from the front of the bitarray, which would shift the whole
            # remainder on every token.
            pos = 0
            total_bits = len(data)

            # A token needs at least 9 bits; fewer remaining bits are padding.
            while total_bits - pos >= 9:
                flag = data[pos]
                pos += 1

                if not flag:
                    # literal: the next 8 bits are the byte itself
                    output_buffer.append(data[pos:pos + 8].tobytes())
                    pos += 8
                else:
                    byte1 = ord(data[pos:pos + 8].tobytes())
                    byte2 = ord(data[pos + 8:pos + 16].tobytes())
                    pos += 16

                    distance = (byte1 << 4) | (byte2 >> 4)
                    length = (byte2 & 0xf)

                    # Copy one byte at a time so a match may overlap the bytes
                    # it is producing (distance < length).
                    for _ in range(length):
                        output_buffer.append(output_buffer[-distance])

            # b'' is a plain str on Python 2, so this join works on both versions
            out_data = b''.join(output_buffer)

            if output_file_path:
                try:
                    with open(output_file_path, 'wb') as output_file:
                        output_file.write(out_data)
                        print('File was decompressed successfully and saved to output path ...')
                        return None
                except IOError:
                    print('Could not write to output file path. Please check if the path is correct ...')
                    raise
            return out_data

        def findLongestMatch(self, data, current_position):
            """
            Finds the longest match to a substring starting at the current_position
            in the lookahead buffer from the history window.

            data -- sliceable byte sequence (str, bytes or bytearray)
            current_position -- index in data where the lookahead buffer starts

            Returns a (distance, length) tuple, where distance counts back from
            current_position to the start of the match, or None when no match of
            at least 2 bytes exists. When several window positions match the
            longest length, the one farthest back is reported.
            """
            # range() below excludes end_of_buffer, so the longest candidate is
            # lookahead_buffer_size - 1 bytes and never runs past the end of data.
            end_of_buffer = min(current_position + self.lookahead_buffer_size, len(data) + 1)
            start_index = max(0, current_position - self.window_size)

            best_match = None

            # Optimization: Only consider substrings of length 2 and greater, and
            # just output any substring of length 1 (8 bits uncompressed is better
            # than 13 bits for the flag, distance, and length)
            for j in range(current_position + 2, end_of_buffer):
                length = j - current_position
                substring = data[current_position:j]

                for i in range(start_index, current_position):
                    # A match may run past current_position into the lookahead
                    # itself. Comparing against data[i:i + length] is equivalent
                    # to comparing against data[i:current_position] repeated out
                    # to `length` bytes, and needs no integer division.
                    if data[i:i + length] == substring:
                        best_match = (current_position - i, length)
                        break
                else:
                    # Nothing in the window matches this length, so no longer
                    # candidate can match either (its prefix would have matched).
                    break

            return best_match
    

      

  • 相关阅读:
    Delphi的字符(Char),字符串(String),字符串指针(PChar),字符数组arrayofchar(来自http://delphi.cjcsoft.net/论坛)
    关于Delphi中的字符串的浅析(瓢虫大作,里面有内存错误的举例)
    String[255]在高版本Delphi里还是被解释成Byte,总体长度256,使用StrPCopy可以给Array String拷贝字符串(内含许多实验测试)
    了解JVM加载实例化类的原理
    轻量级 Material Design 前端框架 MDUI (纯html,css,与css框架跟react vue不冲突)
    只学一点点:我的技术学习策略(虽然不赞同,但可以参考一下,针对不常用的技术可以这样,同时可以:寻找遁去的一,不用管别人怎么想;有学习的时间,不如自己写、自己实践,否则学完了都不知道是什么东西)
    clientdataset<---->json
    完全自定义窗体风格的实现
    监控其它进程
    用JSP+JavaBean开发模式实现一个销售额的查询
  • 原文地址:https://www.cnblogs.com/rmthy/p/6238200.html
Copyright © 2011-2022 走看看