zoukankan      html  css  js  c++  java
  • LZ77.py

    import math
    from bitarray import bitarray
    
    class LZ77Compressor:
        """
        A simplified implementation of the LZ77 Compression Algorithm.

        The compressed stream is a sequence of bit-level tokens:

        * ``0`` flag bit followed by 8 bits: one literal byte.
        * ``1`` flag bit followed by a 12 bit distance (how far back from the
          current position the match starts) and a 4 bit match length.

        The stream is zero-padded at the end to a whole number of bytes.
        The code runs unchanged on Python 2 and Python 3.
        """
        # Upper bound applied to the window size requested by the caller.
        MAX_WINDOW_SIZE = 400

        def __init__(self, window_size=20):
            """
            Create a compressor.

            window_size: how many bytes before the current position are searched
                for matches; clamped to MAX_WINDOW_SIZE.
            """
            self.window_size = min(window_size, self.MAX_WINDOW_SIZE)
            self.lookahead_buffer_size = 15 # length of match is at most 4 bits

        def compress(self, input_file_path, output_file_path=None, verbose=False):
            """
            Compress the content of the file at input_file_path with a simple
            LZ77 scheme.

            The compressed format is:
            0 bit followed by 8 bits (1 byte character) when there are no previous
                matches within window
            1 bit followed by 12 bits pointer (distance to the start of the match
                from the current position) and 4 bits (length of the match)

            If output_file_path is provided, the compressed data is written into
            that binary file and None is returned. Otherwise the compressed data
            is returned as a bitarray.

            If verbose is enabled, a description of every emitted token is
            written to standard output.

            Raises IOError when the input file cannot be read or the output file
            cannot be written (a short message is printed before re-raising).
            """
            import sys  # local import: only needed for the verbose trace

            output_buffer = bitarray(endian='big')

            # read the input file
            try:
                with open(input_file_path, 'rb') as input_file:
                    # bytearray yields integer byte values on Python 2 and 3 alike
                    data = bytearray(input_file.read())
            except IOError:
                print('Could not open input file ...')
                raise

            i = 0
            while i < len(data):
                match = self.findLongestMatch(data, i)

                if match:
                    # 1 bit flag, then 12 bits of distance and 4 bits of length
                    # packed into two bytes: dddddddd ddddllll
                    (bestMatchDistance, bestMatchLength) = match
                    token = bytearray((
                        bestMatchDistance >> 4,
                        ((bestMatchDistance & 0xf) << 4) | bestMatchLength,
                    ))

                    output_buffer.append(True)
                    output_buffer.frombytes(bytes(token))

                    if verbose:
                        sys.stdout.write(
                            "<1, %i, %i> " % (bestMatchDistance, bestMatchLength))

                    i += bestMatchLength
                else:
                    # No useful match was found: 0 bit flag, then the raw byte
                    output_buffer.append(False)
                    output_buffer.frombytes(bytes(bytearray((data[i],))))

                    if verbose:
                        sys.stdout.write("<0, %s> " % chr(data[i]))

                    i += 1

            # pad with zero bits if the number of bits is not a multiple of 8
            output_buffer.fill()

            # write the compressed data into a binary file if a path is provided
            if output_file_path:
                try:
                    with open(output_file_path, 'wb') as output_file:
                        output_file.write(output_buffer.tobytes())
                        print("File was compressed successfully and saved to output path ...")
                        return None
                except IOError:
                    print('Could not write to output file path. Please check if the path is correct ...')
                    raise

            # an output file path was not provided, return the compressed data
            return output_buffer

        def decompress(self, input_file_path, output_file_path=None):
            """
            Decompress the file at input_file_path back to its original form.

            If output_file_path is provided, the decompressed bytes are written
            to it and None is returned. Otherwise the decompressed data is
            returned as a byte string (str on Python 2, bytes on Python 3).

            Raises IOError when the input file cannot be read or the output file
            cannot be written, and ValueError when the compressed stream is
            truncated or contains a back-reference that points outside the data
            decoded so far.
            """
            data = bitarray(endian='big')
            output_buffer = bytearray()

            # read the input file
            try:
                with open(input_file_path, 'rb') as input_file:
                    data.fromfile(input_file)
            except IOError:
                print('Could not open input file ...')
                raise

            # Walk the stream with a cursor rather than deleting consumed bits
            # from the front, which would shift the whole array on every token.
            pos = 0
            total_bits = len(data)

            # The shortest token is 9 bits; anything shorter left over is the
            # zero padding (at most 7 bits) added by compress().
            while total_bits - pos >= 9:
                flag = data[pos]
                pos += 1

                if not flag:
                    # literal: the next 8 bits are the byte itself
                    output_buffer += data[pos:pos + 8].tobytes()
                    pos += 8
                    continue

                if total_bits - pos < 16:
                    raise ValueError('Compressed data is truncated')

                byte1, byte2 = bytearray(data[pos:pos + 16].tobytes())
                pos += 16

                distance = (byte1 << 4) | (byte2 >> 4)
                length = (byte2 & 0xf)

                if distance == 0 or distance > len(output_buffer):
                    raise ValueError('Compressed data contains an invalid back-reference')

                # Copy one byte at a time: when length > distance the match
                # overlaps the bytes being produced by this very copy.
                for _ in range(length):
                    output_buffer.append(output_buffer[-distance])

            out_data = bytes(output_buffer)

            if output_file_path:
                try:
                    with open(output_file_path, 'wb') as output_file:
                        output_file.write(out_data)
                        print('File was decompressed successfully and saved to output path ...')
                        return None
                except IOError:
                    print('Could not write to output file path. Please check if the path is correct ...')
                    raise
            return out_data

        def findLongestMatch(self, data, current_position):
            """
            Find the longest match for the data starting at current_position
            (the lookahead buffer) among the starts inside the history window.

            A match may run past current_position into the lookahead itself,
            which encodes a repetition of the bytes between the match start and
            current_position.

            Returns a (distance, length) tuple, or None when no match of at
            least 2 bytes exists. Among equally long matches the one starting
            farthest back wins. length never exceeds lookahead_buffer_size.
            """
            # Longest encodable match: bounded by the lookahead size (so the
            # length fits in 4 bits) and by the data remaining.
            max_length = min(self.lookahead_buffer_size, len(data) - current_position)

            # Optimization: only matches of length 2 and greater are useful,
            # a single literal (9 bits) is cheaper than a match token (17 bits)
            if max_length < 2:
                return None

            start_index = max(0, current_position - self.window_size)

            best_match_distance = 0
            best_match_length = 1

            for start in range(start_index, current_position):
                length = 0
                # start + length stays below current_position + length, which is
                # below len(data), so both indexes are always in range.
                while (length < max_length and
                       data[start + length] == data[current_position + length]):
                    length += 1

                if length > best_match_length:
                    best_match_distance = current_position - start
                    best_match_length = length
                    if best_match_length == max_length:
                        # cannot be beaten; later starts would only tie
                        break

            if best_match_length >= 2:
                return (best_match_distance, best_match_length)
            return None
    

      

  • 相关阅读:
    【Oracle 故障处理一则】 ORA-600
    【转载】关于物化视图
    【转载】OTLP和OLAP的区别
    【转载】Linux追加虚拟内存Swap
    【转载】Keepalived 的使用
    【识记】修复oracle的坏块
    K-means算法[聚类算法]
    决策树ID3算法[分类算法]
    6)图[1]之邻接矩阵存储[深度遍历和广度遍历]
    2)杨辉三角[2]递归实现
  • 原文地址:https://www.cnblogs.com/rmthy/p/6238200.html
Copyright © 2011-2022 走看看