LZW (Lempel-Ziv-Welch) was the first widely used universal data compression method on computers. It typically compresses large English texts to about half of their original size. LZW is still used today in GIF and PDF.
The basic idea: a sequence of adjacent input symbols is called a phrase. Phrases are added to a table as the input stream is read, and the indices of the phrases in the table are used to form the output.
There are two columns in the table: phrase and its index. Each phrase is composed of a prefix and a symbol, the prefix is an index in the table referencing another phrase, the symbol is appended to the prefix to form the new phrase.
Encode Algorithm:
initialize table;
word <- NIL;
while (there is input)
{
symbol <- next symbol from input;
phrase <- word + symbol;
if (phrase exists in the table)
{
word <- phrase;
}
else
{
output (index(word));
add phrase to the table;
word <- symbol;
}
}
output (index(word));
Decode Algorithm:
initialize table;
phrase <- NIL;
while (there is input)
{
wordIndex <- next code from input;
if (wordIndex exists in the table)
{
word <- dictionary[wordIndex];
phrase <- phrase + head(word);
if(phrase.Length > 1)
{
add phrase to the dictionary;
}
}
else
{
phrase <- phrase + head(phrase);
add phrase to the dictionary;
word <- phrase; //word <- dictionary[wordIndex];
}
phrase <- word;
output (word);
}
I implemented the algorithm in C# according to the PDF Reference, which specifies additional encoding details:
/// <summary>
/// A single LZW table entry: a phrase formed by appending one symbol to the
/// phrase stored at another table index.
/// </summary>
/// <remarks>
/// The type is used as a dictionary key by the encoder, so it implements
/// <see cref="IEquatable{T}"/> to give value equality without boxing.
/// Note that <c>new Phrase()</c> (the implicit default constructor) yields
/// PrefixIndex 0 and Symbol 0, not a prefix-less phrase.
/// </remarks>
public struct Phrase : IEquatable<Phrase>
{
    /// <summary>
    /// Table index of the phrase this one extends; the single-argument
    /// constructor sets it to -1 to mark a phrase that has no prefix.
    /// </summary>
    public int PrefixIndex;

    /// <summary>
    /// The symbol appended to the prefix phrase.
    /// </summary>
    public int Symbol;

    /// <summary>
    /// Initializes a new instance of the <see cref="Phrase"/> struct that has
    /// no prefix (PrefixIndex is set to -1).
    /// </summary>
    /// <param name="symbol">The symbol.</param>
    public Phrase(int symbol)
    {
        Symbol = symbol;
        PrefixIndex = -1;
    }

    /// <summary>
    /// Initializes a new instance of the <see cref="Phrase"/> struct that
    /// extends an existing table entry.
    /// </summary>
    /// <param name="symbol">The symbol.</param>
    /// <param name="refIdnex">The table index of the prefix phrase.</param>
    public Phrase(int symbol, int refIdnex)
    {
        Symbol = symbol;
        PrefixIndex = refIdnex;
    }

    /// <summary>
    /// Indicates whether this instance and another phrase hold the same
    /// symbol and the same prefix index.
    /// </summary>
    /// <param name="other">The phrase to compare to.</param>
    /// <returns>true if both fields are equal; otherwise, false.</returns>
    public bool Equals(Phrase other)
    {
        return Symbol == other.Symbol && PrefixIndex == other.PrefixIndex;
    }

    /// <summary>
    /// Indicates whether this instance and a specified object are equal.
    /// </summary>
    /// <param name="obj">Another object to compare to.</param>
    /// <returns>
    /// true if obj is a <see cref="Phrase"/> with the same value; false for
    /// null, for any other type, or for a different value.
    /// </returns>
    public override bool Equals(object obj)
    {
        // Type-test first: an unconditional unboxing cast would throw on null
        // or on a foreign type instead of simply reporting "not equal".
        return obj is Phrase && Equals((Phrase)obj);
    }

    /// <summary>
    /// Returns the hash code for this instance.
    /// </summary>
    /// <returns>
    /// A 32-bit signed integer combining both fields; equal phrases always
    /// produce equal hash codes.
    /// </returns>
    public override int GetHashCode()
    {
        // Multiply-and-xor keeps (prefix, symbol) pairs with the same sum in
        // different buckets, which a plain addition would not.
        unchecked
        {
            return (PrefixIndex * 397) ^ Symbol;
        }
    }

    /// <summary>
    /// Returns a textual form of the phrase as "PrefixIndex+Symbol".
    /// </summary>
    /// <returns>
    /// A <see cref="T:System.String"></see> such as "258+65".
    /// </returns>
    public override string ToString()
    {
        return String.Format("{0}+{1}", PrefixIndex, Symbol);
    }
}
/// <summary>
/// Lzw compress algorithm.
/// </summary>
public partial class Lzw
{
    /// <summary>
    /// Clear Table Marker: tells the decoder to reset its table and code width.
    /// </summary>
    const int ClearTableMarker = 256;

    /// <summary>
    /// End Of Data Marker: the last code of every encoded stream.
    /// </summary>
    const int EOD = 257;

    /// <summary>
    /// Code width, in bits, used right after the table has been (re)initialized.
    /// </summary>
    const int MinCodeBits = 9;

    /// <summary>
    /// Widest code, in bits, the encoder ever emits.
    /// </summary>
    const int MaxCodeBits = 12;

    /// <summary>
    /// Number of table entries at which the encoder stops adding phrases and
    /// resets the table instead.
    /// </summary>
    const int MaxTableSize = 4096;

    /// <summary>
    /// Encodes the specified input.
    /// </summary>
    /// <remarks>
    /// The codes produced by the analyzer are packed into a scratch buffer with
    /// <c>BitStream.WriteBitsBigEndian</c>; the buffer is then copied to
    /// <paramref name="output"/> with the bits of every byte reversed.
    /// </remarks>
    /// <param name="input">The stream to compress; read byte by byte until it ends.</param>
    /// <param name="output">The stream that receives the compressed bytes.</param>
    public static void Encode(Stream input, Stream output)
    {
        using (MemoryStream memStream = new MemoryStream())
        {
            BitStream outStream = new BitStream(memStream);
            // <code, numbits>
            foreach (Pair<int, int> code in Analyze(input))
            {
                outStream.WriteBitsBigEndian(code.Left, code.Right);
            }
            outStream.Flush();
            memStream.Position = 0;
            BitOrder.Reverse(memStream, output);
        }
    }

    /// <summary>
    /// Resets the encoder table so it holds exactly the prefix-less phrases
    /// 0..EOD, each mapped to its own value.
    /// </summary>
    /// <param name="Table">The table to reset.</param>
    private static void InitializeTable(Dictionary<Phrase, int> Table)
    {
        Table.Clear();
        for (int i = 0; i <= EOD; i++)
        {
            Table.Add(new Phrase(i), i);
        }
    }

    /// <summary>
    /// Turns the input bytes into the sequence of LZW codes to emit.
    /// </summary>
    /// <param name="input">The stream to read symbols from.</param>
    /// <returns>
    /// Pairs whose Left is the code and whose Right is the number of bits to
    /// write it with. The sequence always starts with the clear-table marker
    /// and ends with the end-of-data marker.
    /// </returns>
    private static IEnumerable<Pair<int, int>> Analyze(Stream input)
    {
        Dictionary<Phrase, int> Table = new Dictionary<Phrase, int>();
        InitializeTable(Table);
        int numbits = MinCodeBits;
        yield return new Pair<int, int>(ClearTableMarker, numbits);

        // Code of the longest phrase matched so far; -1 while nothing has been read.
        int wordIndex = -1;
        Phrase currentPhrase = new Phrase();
        int symbol;
        while ((symbol = input.ReadByte()) != -1)
        {
            currentPhrase.PrefixIndex = wordIndex;
            currentPhrase.Symbol = symbol;

            int knownIndex;
            if (Table.TryGetValue(currentPhrase, out knownIndex))
            {
                // The extended phrase is already known: keep growing it.
                wordIndex = knownIndex;
                continue;
            }

            // wordIndex cannot be -1 here: a lone byte is always in the table.
            yield return new Pair<int, int>(wordIndex, numbits);
            if (Table.Count < MaxTableSize)
            {
                Table.Add(currentPhrase, Table.Count);
                // Widen the code once the table outgrows the current width
                // (9 -> 10 at 512 entries, 10 -> 11 at 1024, 11 -> 12 at 2048).
                if (numbits < MaxCodeBits && Table.Count > (1 << numbits) - 1)
                {
                    numbits++;
                }
            }
            else
            {
                // Table is full: start over and tell the decoder to do the same.
                // The marker itself is still written with the old width.
                InitializeTable(Table);
                yield return new Pair<int, int>(ClearTableMarker, numbits);
                numbits = MinCodeBits;
            }

            // Restart matching from the symbol that broke the phrase; the code
            // of a single byte is the byte value itself.
            wordIndex = symbol;
        }

        // Emit the pending phrase. For empty input there is none, and emitting
        // -1 would put an invalid code into the stream.
        if (wordIndex != -1)
        {
            yield return new Pair<int, int>(wordIndex, numbits);
        }
        yield return new Pair<int, int>(EOD, numbits);
    }
}
/// <summary>
/// Lzw Decompress algorithm.
/// </summary>
public partial class Lzw
{
    /// <summary>
    /// Decodes the specified input.
    /// </summary>
    /// <remarks>
    /// The remainder of <paramref name="input"/> is read and bit-reversed into
    /// memory first, then decoded code by code. Afterwards the position of
    /// <paramref name="input"/> is set to its starting position plus the
    /// position reported by the internal bit reader, so the stream must
    /// support seeking.
    /// </remarks>
    /// <param name="input">The stream holding LZW-compressed data.</param>
    /// <param name="output">The stream that receives the decoded bytes.</param>
    /// <exception cref="InvalidDataException">
    /// A code refers to a table entry that cannot exist at that point of the stream.
    /// </exception>
    public static void Decode(Stream input, Stream output)
    {
        // Widest code is 12 bits, so entries beyond index 4095 can never be
        // referenced; stop adding there to keep the table bounded.
        const int tableCapacity = 4096;
        const int initialCodeBits = 9;

        long pos = input.Position;
        MemoryStream memStream = BitOrder.Reverse(input);
        BitStream inStream = new BitStream(memStream);
        Dictionary<int, Phrase> Table = new Dictionary<int, Phrase>();
        // Start with a valid table so that streams which do not begin with a
        // clear-table marker still get correctly numbered entries.
        InitializeTable(Table);
        Phrase phrase = new Phrase();
        int wordIndex = -1; // previous code, -1 right after a table reset
        int numbits = initialCodeBits;
        while (true)
        {
            int code = inStream.ReadBitsBigEndian(numbits);
            if (code == EOD || code == -1)
            {
                // End-of-data marker, or the bit reader ran out of input.
                break;
            }
            if (code == ClearTableMarker)
            {
                InitializeTable(Table);
                wordIndex = -1;
                numbits = initialCodeBits;
                continue;
            }

            if (code < 256)
            {
                // A literal byte. Together with the previous phrase it forms
                // the next table entry (nothing to add for the first code).
                phrase.PrefixIndex = wordIndex;
                phrase.Symbol = code;
                if (wordIndex != -1 && Table.Count < tableCapacity)
                {
                    Table.Add(Table.Count, phrase);
                }
                output.WriteByte((byte)code);
            }
            else
            {
                int head;
                Phrase known;
                if (Table.TryGetValue(code, out known))
                {
                    head = GetHead(known, Table);
                }
                else
                {
                    // The only legal unknown code is the entry about to be
                    // created: previous phrase + its own first symbol.
                    if (wordIndex == -1 || code != Table.Count)
                    {
                        throw new InvalidDataException(
                            "Invalid LZW code " + code + " (table size " + Table.Count + ").");
                    }
                    head = GetHead(Table[wordIndex], Table);
                }
                phrase.PrefixIndex = wordIndex;
                phrase.Symbol = head;
                if (Table.Count < tableCapacity)
                {
                    Table.Add(Table.Count, phrase);
                }
                Output(code, Table, output);
            }

            // The decoder's table lags one entry behind the encoder's, hence
            // the thresholds 511 / 1023 / 2047 entries.
            if (numbits == 9 && Table.Count > 510)
            {
                numbits = 10;
            }
            else if (numbits == 10 && Table.Count > 1022)
            {
                numbits = 11;
            }
            else if (numbits == 11 && Table.Count > 2046)
            {
                numbits = 12;
            }
            wordIndex = code;
        }
        input.Position = pos + inStream.Position;
    }

    /// <summary>
    /// Finds the first symbol of a phrase by following prefix links until an
    /// entry without a prefix (PrefixIndex -1) is reached.
    /// </summary>
    /// <param name="phrase">The phrase whose first symbol is wanted.</param>
    /// <param name="Table">The table the prefix indices refer to.</param>
    /// <returns>The symbol of the prefix-less entry at the start of the chain.</returns>
    private static int GetHead(Phrase phrase, Dictionary<int, Phrase> Table)
    {
        Phrase entry = phrase;
        while (entry.PrefixIndex != -1)
        {
            entry = Table[entry.PrefixIndex];
        }
        return entry.Symbol;
    }

    /// <summary>
    /// Writes the bytes of the phrase stored under <paramref name="code"/>.
    /// </summary>
    /// <param name="code">The table index of the phrase.</param>
    /// <param name="Table">The table the code and its prefix links refer to.</param>
    /// <param name="output">The stream that receives the bytes.</param>
    private static void Output(int code, Dictionary<int, Phrase> Table, Stream output)
    {
        // Walking the prefix chain yields the symbols last-to-first, so they
        // are collected and reversed before being written.
        List<byte> symbols = new List<byte>();
        Phrase entry = Table[code];
        while (entry.PrefixIndex != -1)
        {
            symbols.Add((byte)entry.Symbol);
            entry = Table[entry.PrefixIndex];
        }
        symbols.Add((byte)entry.Symbol);
        symbols.Reverse();
        output.Write(symbols.ToArray(), 0, symbols.Count);
    }

    /// <summary>
    /// Resets the decoder table so it holds exactly the prefix-less phrases
    /// 0..EOD, each stored under its own value.
    /// </summary>
    /// <param name="Table">The table to reset.</param>
    private static void InitializeTable(Dictionary<int, Phrase> Table)
    {
        Table.Clear();
        for (int i = 0; i <= EOD; i++)
        {
            Table.Add(i, new Phrase(i));
        }
    }
}
/// <summary>
/// Reverses the order of the eight bits inside bytes (bit 0 swaps with bit 7,
/// bit 1 with bit 6, and so on).
/// </summary>
public class BitOrder
{
    // Lookup table: entry i holds i with its bits mirrored.
    static readonly byte[] BitReverseTable = BuildReverseTable();

    // Computes all 256 mirrored values once, when the type is initialized.
    private static byte[] BuildReverseTable()
    {
        byte[] mirrored = new byte[256];
        for (int source = 0; source < mirrored.Length; source++)
        {
            int flipped = 0;
            for (int bit = 0; bit < 8; bit++)
            {
                if ((source & (1 << bit)) != 0)
                {
                    flipped |= 0x80 >> bit;
                }
            }
            mirrored[source] = (byte)flipped;
        }
        return mirrored;
    }

    /// <summary>
    /// Mirrors the bits of a single byte.
    /// </summary>
    /// <param name="value">The byte to mirror.</param>
    /// <returns>The byte with its bit order reversed, e.g. 0x01 becomes 0x80.</returns>
    public static byte Reverse(byte value)
    {
        return BitReverseTable[value];
    }

    /// <summary>
    /// Copies every remaining byte of one stream to another, mirroring the
    /// bits of each byte on the way.
    /// </summary>
    /// <param name="input">The stream to read until ReadByte reports -1.</param>
    /// <param name="output">The stream that receives the mirrored bytes.</param>
    public static void Reverse(Stream input, Stream output)
    {
        for (int current = input.ReadByte(); current != -1; current = input.ReadByte())
        {
            output.WriteByte(BitReverseTable[current]);
        }
    }

    /// <summary>
    /// Mirrors every remaining byte of a stream into a new in-memory stream.
    /// </summary>
    /// <param name="input">The stream to read to its end.</param>
    /// <returns>A <see cref="MemoryStream"/> holding the mirrored bytes, positioned at 0.</returns>
    public static MemoryStream Reverse(Stream input)
    {
        MemoryStream mirroredCopy = new MemoryStream();
        Reverse(input, mirroredCopy);
        mirroredCopy.Position = 0;
        return mirroredCopy;
    }
}
/// <summary>
/// Reads and writes individual bits, as well as whole bytes, on top of a
/// <see cref="Stream"/>.
/// </summary>
/// <remarks>
/// Bits are consumed and produced starting with the least-significant bit of
/// each byte (the indexing order of <see cref="BitArray"/> built from bytes).
/// The byte-level members bypass the bit buffers: they do not see bits that
/// are pending in the read buffer or not yet flushed from the write buffer.
/// </remarks>
public class BitStream
{
    /// <summary>
    /// Buffered wrapper around the stream passed to the constructor.
    /// </summary>
    BufferedStream stream;

    /// <summary>
    /// Initializes a new instance of the <see cref="BitStream"/> class.
    /// </summary>
    /// <param name="stream">The stream to read from or write to.</param>
    public BitStream(Stream stream)
    {
        this.stream = new BufferedStream(stream);
    }

    /// <summary>
    /// Gets the length of the underlying buffered stream, in bytes.
    /// </summary>
    /// <value>The length.</value>
    public long Length
    {
        get { return stream.Length; }
    }

    /// <summary>
    /// Gets or sets the byte position of the underlying buffered stream.
    /// Setting it does not discard bits already held in the read buffer.
    /// </summary>
    /// <value>The position.</value>
    public long Position
    {
        get { return stream.Position; }
        set { stream.Position = value; }
    }

    /// <summary>
    /// Gets a value indicating whether the byte position equals the length.
    /// </summary>
    /// <value>
    /// <c>true</c> if Position equals Length; otherwise, <c>false</c>.
    /// Bits still held in the read buffer are not taken into account.
    /// </value>
    public bool IsEndReached
    {
        get { return Position == Length; }
    }

    // Bits of the most recently fetched byte(s); null until the first bit read.
    BitArray bitBuffer;

    /// <summary>
    /// Index in the bit buffer of the next bit to hand out.
    /// </summary>
    public int bitsRead = 0;

    // Number of bits in the bit buffer that have not been handed out yet.
    int bitsRemained = 0;

    /// <summary>
    /// Reads num bits from the stream and returns them as an integer in which
    /// the first bit read is the most significant one.
    /// </summary>
    /// <param name="num">The number of bits to read, from 1 to 16.</param>
    /// <returns>
    /// The value read, or -1 if the stream does not hold enough bytes to
    /// supply the requested bits.
    /// </returns>
    /// <exception cref="ArgumentOutOfRangeException">num is less than 1 or greater than 16.</exception>
    public int ReadBitsBigEndian(int num)
    {
        if (num < 1 || num > 16)
        {
            throw new ArgumentOutOfRangeException("num", "number of bits must be between 1 and 16");
        }
        if (num <= bitsRemained)
        {
            // Everything requested is already buffered.
            int result = GetBinaryInteger(bitBuffer, bitsRead, num);
            bitsRead += num;
            bitsRemained -= num;
            return result;
        }
        else
        {
            // Use up the buffered bits, then fetch just enough new bytes for the rest.
            int count = num - bitsRemained;
            int bytesToRead = count <= 8 ? 1 : 2;
            byte[] data = ReadBytes(bytesToRead);
            if (data.Length == bytesToRead)
            {
                int result = GetBinaryInteger(bitBuffer, bitsRead, bitsRemained);
                bitBuffer = new BitArray(data);
                result = GetBinaryInteger(result, bitBuffer, 0, count);
                bitsRead = count;
                bitsRemained = bytesToRead * 8 - count;
                return result;
            }
            else
            {
                return -1;
            }
        }
    }

    /// <summary>
    /// Builds an integer from consecutive bits, first bit most significant.
    /// </summary>
    /// <param name="array">The bits; may be null when count is 0.</param>
    /// <param name="start">The index of the first bit.</param>
    /// <param name="count">The number of bits.</param>
    /// <returns>The assembled value.</returns>
    private static int GetBinaryInteger(BitArray array, int start, int count)
    {
        return GetBinaryInteger(0, array, start, count);
    }

    /// <summary>
    /// Appends consecutive bits to an already assembled value, shifting the
    /// value left by one for every bit added.
    /// </summary>
    /// <param name="initial">The value assembled so far.</param>
    /// <param name="array">The bits; may be null when count is 0.</param>
    /// <param name="start">The index of the first bit.</param>
    /// <param name="count">The number of bits.</param>
    /// <returns>The extended value.</returns>
    private static int GetBinaryInteger(int initial, BitArray array, int start, int count)
    {
        int result = initial;
        for (int n = start; n < start + count; n++)
        {
            int bit = array[n] ? 1 : 0;
            result = result * 2 + bit;
        }
        return result;
    }

    /// <summary>
    /// Reads num bits and returns them as an integer in which the first bit
    /// read is the least significant one.
    /// </summary>
    /// <param name="num">The number of bits to read, from 1 to 16.</param>
    /// <returns>The value read, or -1 if not enough bits remain.</returns>
    public int ReadBits(int num)
    {
        int result = ReadBitsBigEndian(num);
        if (result == -1)
        {
            // Keep the end-of-stream sentinel intact; mirroring it would turn
            // it into an ordinary all-ones value.
            return -1;
        }
        BitArray bits = new BitArray(BitConverter.GetBytes(result));
        return GetBinaryInteger(0, bits, 0, num);
    }

    /// <summary>
    /// Reads one bit.
    /// </summary>
    /// <returns>0 or 1, or -1 if no bit remains.</returns>
    public int ReadBit()
    {
        return ReadBitsBigEndian(1);
    }

    /// <summary>
    /// Goes to the next byte boundary; buffered bits not yet read are dropped.
    /// </summary>
    public void GotoNextByte()
    {
        bitsRead = 0;
        bitsRemained = 0;
    }

    /// <summary>
    /// Reads one byte directly from the stream.
    /// </summary>
    /// <returns>
    /// The byte read. The end-of-stream result of the underlying ReadByte is
    /// cast to byte as well, so it cannot be told apart from a data byte 255.
    /// </returns>
    public byte ReadByte()
    {
        return (byte)stream.ReadByte();
    }

    /// <summary>
    /// Moves to the given byte offset and reads one byte there.
    /// </summary>
    /// <param name="offset">The byte position to read at.</param>
    /// <returns>The byte read, or -1 at the end of the stream.</returns>
    public int ReadByte(long offset)
    {
        stream.Position = offset;
        return stream.ReadByte();
    }

    /// <summary>
    /// Reads up to count bytes.
    /// </summary>
    /// <param name="count">The number of bytes wanted.</param>
    /// <returns>
    /// An array of exactly count bytes, or a shorter array holding what was
    /// available when the stream ended first.
    /// </returns>
    public byte[] ReadBytes(int count)
    {
        byte[] data = new byte[count];
        int total = 0;
        // A single Read call may legally deliver fewer bytes than requested
        // without the stream having ended, so keep reading until it reports 0.
        while (total < count)
        {
            int bytesRead = stream.Read(data, total, count - total);
            if (bytesRead <= 0)
            {
                break;
            }
            total += bytesRead;
        }
        if (total == count)
        {
            return data;
        }
        byte[] bytes = new byte[total];
        Array.Copy(data, bytes, total);
        return bytes;
    }

    /// <summary>
    /// Reads all bytes from the current position to the end.
    /// </summary>
    /// <returns>The remaining bytes.</returns>
    public byte[] ReadToEnd()
    {
        int bytesRemained = (int)(Length - Position);
        return ReadBytes(bytesRemained);
    }

    /// <summary>
    /// Reads bytes at the current position without advancing it.
    /// </summary>
    /// <param name="count">The number of bytes wanted.</param>
    /// <returns>The bytes found; fewer than count at the end of the stream.</returns>
    public byte[] PeekBytes(int count)
    {
        return PeekBytes(this.Position, count);
    }

    /// <summary>
    /// Reads bytes at the given offset and then restores the previous position.
    /// </summary>
    /// <param name="offset">The byte position to read at.</param>
    /// <param name="length">The number of bytes wanted.</param>
    /// <returns>The bytes found; fewer than length at the end of the stream.</returns>
    public byte[] PeekBytes(long offset, int length)
    {
        long pos = this.Position;
        this.Position = offset;
        byte[] data = ReadBytes(length);
        this.Position = pos;
        return data;
    }

    /// <summary>
    /// Reads two bytes and converts them with <see cref="BitConverter"/>.
    /// </summary>
    /// <returns>
    /// The converted value. BitConverter throws when fewer than two bytes
    /// were available.
    /// </returns>
    public ushort ReadUInt16()
    {
        byte[] data = ReadBytes(2);
        return BitConverter.ToUInt16(data, 0);
    }

    // Pending output bits; new bits enter at the top and move down, so the
    // oldest pending bit is the lowest one once 32 bits are collected.
    uint writebuffer = 0;

    /// <summary>
    /// Number of bits collected in the write buffer and not yet written out.
    /// </summary>
    public int bitsWritten = 0;

    /// <summary>
    /// Writes the lowest num bits of value, least-significant bit first.
    /// </summary>
    /// <param name="value">The value.</param>
    /// <param name="num">The number of bits.</param>
    public void WriteBits(int value, int num)
    {
        if (num == 0) return;
        BitArray bits = new BitArray(BitConverter.GetBytes(value));
        for (int i = 0; i < num; i++)
        {
            uint bit = bits[i] ? 0x80000000 : 0;
            writebuffer = (writebuffer >> 1) | bit;
            bitsWritten++;
            if (bitsWritten == 32)
            {
                // Buffer is full: hand its four bytes to the stream.
                WriteBytes(BitConverter.GetBytes(writebuffer));
                ClearWriteBuffer();
            }
        }
    }

    /// <summary>
    /// Writes the lowest num bits of value, most-significant of those bits first.
    /// </summary>
    /// <param name="value">The value.</param>
    /// <param name="num">The number of bits.</param>
    public void WriteBitsBigEndian(int value, int num)
    {
        // Mirror the low num bits, then let WriteBits emit them in its own order.
        BitArray bits = new BitArray(BitConverter.GetBytes(value));
        int result = GetBinaryInteger(0, bits, 0, num);
        WriteBits(result, num);
    }

    /// <summary>
    /// Writes the bit.
    /// </summary>
    /// <param name="bit">if set to <c>true</c> a 1 bit is written, otherwise a 0 bit.</param>
    public void WriteBit(bool bit)
    {
        WriteBit(bit ? 1 : 0);
    }

    /// <summary>
    /// Writes the lowest bit of the given value.
    /// </summary>
    /// <param name="bit">The bit.</param>
    public void WriteBit(int bit)
    {
        WriteBits(bit, 1);
    }

    // Empties the write buffer after its content has been written out.
    private void ClearWriteBuffer()
    {
        writebuffer = 0;
        bitsWritten = 0;
    }

    /// <summary>
    /// Writes out the bits pending in the write buffer, padding with zero bits
    /// up to the next byte border, and then flushes the underlying stream.
    /// </summary>
    public void Flush()
    {
        if (bitsWritten > 0)
        {
            // Move the pending bits down so the oldest one becomes bit 0.
            writebuffer = writebuffer >> (32 - bitsWritten);
            byte[] bytes = BitConverter.GetBytes(writebuffer);
            int count = bitsWritten / 8;
            if (bitsWritten % 8 != 0)
            {
                count++;
            }
            stream.Write(bytes, 0, count);
            ClearWriteBuffer();
        }
        stream.Flush();
    }

    /// <summary>
    /// Writes one byte directly to the stream, bypassing the bit buffer.
    /// </summary>
    /// <param name="value">The value.</param>
    public void WriteByte(byte value)
    {
        stream.WriteByte(value);
    }

    /// <summary>
    /// Writes all bytes of the array directly to the stream, bypassing the bit buffer.
    /// </summary>
    /// <param name="data">The data.</param>
    public void WriteBytes(byte[] data)
    {
        stream.Write(data, 0, data.Length);
    }

    /// <summary>
    /// Writes part of the array directly to the stream, bypassing the bit buffer.
    /// </summary>
    /// <param name="data">The data.</param>
    /// <param name="offset">The index of the first byte to write.</param>
    /// <param name="count">The number of bytes to write.</param>
    public void WriteBytes(byte[] data, int offset, int count)
    {
        stream.Write(data, offset, count);
    }
}