zoukankan      html  css  js  c++  java
  • 蛙蛙推荐:蛙蛙牌XML压缩算法

    摘要:

    在用XML传输数据的时候,XML本身的元素名称,属性名称可能比有效的信息量占的地方还要大,本文示例一种简单实用的算法来进行XML压缩,主要思路是把XML标签和属性用整数来表示以便降低传输量。

    单元测试代码

    /// <summary>
    /// Console demo that compares three encodings of the same sample document:
    /// raw UTF-8, Deflate, and the dictionary-based <c>XmlZip</c> encoding
    /// (optionally followed by Deflate).
    /// </summary>
    class Program {

        /// <summary>
        /// Sample document. It is used both to build the name dictionary and as the
        /// payload that gets compressed. The literal is whitespace-sensitive for the
        /// byte counts printed below, so it is kept exactly as written.
        /// </summary>
        public static string XML = @"<?xml version=""1.0"" encoding=""utf-16""?>
        <Customer>
    <CustomerID>ALFKI</CustomerID>
    <PO>9572658</PO>
    <Address AddressType=""work"">
        <Street>One Main Street</Street>
        <City>Anywhere</City>
        <State>NJ</State>
        <Zip>08080</Zip>
    </Address>
    <Order>
        <OrderID>10966</OrderID >
        <LineItem>
            <ProductID>37</ProductID>
            <UnitPrice>26.50 </UnitPrice>
            <Quantity>8</Quantity>
            <Description>Gravad lax </Description>             
        </LineItem>
        <LineItem>
            <ProductID>56 </ProductID>
            <UnitPrice>38.00</UnitPrice>
            <Quantity>12</Quantity>
            <Description>Gnocchi di nonna Alice</Description>             
        </LineItem>
    </Order>    
    </Customer>
    ";

        /// <summary>
        /// Prints the size of the sample document in each encoding, then the
        /// round-tripped XML, and finally waits for a key press.
        /// </summary>
        /// <param name="args">Command-line arguments (unused).</param>
        static void Main(string[] args) {
            // Baseline: raw UTF-8 bytes and plain Deflate.
            byte[] bs = Encoding.UTF8.GetBytes(XML);
            Console.WriteLine("原始文件长度:{0}", bs.Length);
            Console.WriteLine("Deflate压缩后长度: {0}", DeflatedLength(bs));

            // Dictionary encoding: the same document serves as the dictionary sample.
            XmlZip zip = new XmlZip();
            zip.Init(XML);
            bs = zip.XmlToBytes(XML);
            Console.WriteLine("XML压缩后长度:{0}", bs.Length);

            // Round trip back to text to show that the content survives.
            string str = zip.BytesToXml(bs);
            Console.WriteLine("还原后长度:{0}", Encoding.UTF8.GetByteCount(str));
            Console.WriteLine(str);

            // Dictionary encoding followed by Deflate.
            Console.WriteLine("先XML压缩,再Deflate压缩后的长度:{0}", DeflatedLength(bs));
            Console.ReadKey();
        }

        /// <summary>
        /// Returns the number of bytes produced by Deflate-compressing <paramref name="data"/>.
        /// </summary>
        /// <param name="data">Bytes to compress.</param>
        /// <returns>Length of the compressed output in bytes.</returns>
        static long DeflatedLength(byte[] data) {
            using (MemoryStream ms = new MemoryStream()) {
                // leaveOpen = true so the MemoryStream can still be measured after the
                // DeflateStream is disposed; disposing it flushes the final block.
                using (DeflateStream deflate = new DeflateStream(ms, CompressionMode.Compress, true)) {
                    deflate.Write(data, 0, data.Length);
                }
                return ms.Length;
            }
        }
    }

    测试输出

    原始文件长度:740
    Deflate压缩后长度: 438
    XML压缩后长度:295
    还原后长度:727
    <?xml version="1.0" encoding="utf-16"?>
    <Customer>
      <CustomerID>ALFKI</CustomerID>
      <PO>9572658</PO>
      <Address AddressType="work">
        <Street>One Main Street</Street>
        <City>Anywhere</City>
        <State>NJ</State>
        <Zip>08080</Zip>
      </Address>
      <Order>
        <OrderID>10966</OrderID>
        <LineItem>
          <ProductID>37</ProductID>
          <UnitPrice>26.50 </UnitPrice>
          <Quantity>8</Quantity>
          <Description>Gravad lax </Description>             
        </LineItem>
        <LineItem>
          <ProductID>56 </ProductID>
          <UnitPrice>38.00</UnitPrice>
          <Quantity>12</Quantity>
          <Description>Gnocchi di nonna Alice</Description>             
        </LineItem>
      </Order>
    </Customer>
    先XML压缩,再Deflate压缩后的长度:357

    可以看到,XML压缩后的数据约为原始数据的40%(295/740字节),优于直接Deflate压缩的结果(438字节)。压缩率可能不如其它专用的压缩算法,但效果还算令人满意;而且该算法比较通用,只要通信双方知道了XML Schema,甚至只需要双方持有同一份完整的示例XML文档,就可以进行压缩通信。目前只做了功能测试,没做性能测试,大家可以先借鉴下思路。

    完整代码

    大致原理,就是通信双方各持有一个XML文档节点名称,属性名称的一个字典,然后发送方传输的时候用ushort代替原有的XML标签和属性名,接收方通过字典把ushort再转换成原始的元素名和属性名,这样大量不必要的重复的标签等就省去了。

    代码只做本文的示例,写的比较随意,没有什么防御性和健壮性。

    /// <summary>
    /// Kind of XML name that a dictionary entry stands for.
    /// </summary>
    internal enum ItemType {
        /// <summary>The entry is an element name.</summary>
        Element = 0,

        /// <summary>
        /// The entry is an attribute name. The identifier's spelling is kept as-is
        /// because other code in this file refers to it by this name.
        /// </summary>
        Attritube = 1
    }
    /// <summary>
    /// One entry of the name dictionary: a node's path, its plain name and
    /// whether it is an element or an attribute.
    /// </summary>
    internal class XmlNodeItem {
        /// <summary>Path of the node, e.g. <c>/Customer/Address/@AddressType</c>.</summary>
        public string Xpath { get; set; }

        /// <summary>Name of the element or attribute as it is written back to XML.</summary>
        public string Text { get; set; }

        /// <summary>Whether this entry names an element or an attribute.</summary>
        public ItemType ItemType { get; set; }

        /// <summary>Returns <see cref="Xpath"/>.</summary>
        public override string ToString() {
            return Xpath;
        }
    }
    /// <summary>
    /// Minimal path builder used as a stack: segments are pushed as the reader
    /// descends into the document and popped as it leaves.
    /// </summary>
    internal class MyXpath {
        LinkedList<string> _node = new LinkedList<string>();

        /// <summary>Appends an element segment of the form <c>/name</c>.</summary>
        /// <param name="name">Element name.</param>
        public void AddElement(string name) {
            _node.AddLast(string.Format("/{0}", name));
        }

        /// <summary>Appends an attribute segment of the form <c>/@name</c>.</summary>
        /// <param name="name">Attribute name.</param>
        public void AddAttribute(string name) {
            _node.AddLast(string.Format("/@{0}", name));
        }

        /// <summary>
        /// Removes the most recently added segment (element or attribute).
        /// </summary>
        /// <exception cref="InvalidOperationException">The path is empty.</exception>
        public void RemoveLastElement() {
            _node.RemoveLast();
        }

        /// <summary>
        /// Concatenates all segments in insertion order.
        /// </summary>
        /// <returns>
        /// The path, e.g. <c>/a/b/@c</c>, or an empty string when no segment has
        /// been added (previously this case threw a NullReferenceException).
        /// </returns>
        public override string ToString() {
            StringBuilder sb = new StringBuilder();
            foreach (string segment in _node) {
                sb.Append(segment);
            }
            return sb.ToString();
        }
    }
    /// <summary>
    /// Dictionary-based XML encoder/decoder. Element and attribute names are
    /// replaced by 16-bit indexes taken from a dictionary that both sides build
    /// from the same sample document via <see cref="Init"/>.
    /// </summary>
    /// <remarks>
    /// Stream layout, as written with <see cref="BinaryWriter"/>:
    /// element start = index; attribute = index, byte length (ushort), UTF-8 bytes;
    /// text = 0, byte length (ushort), UTF-8 bytes; element end = 0xFFFF.
    /// Only element, end-element and text nodes are encoded; every other node
    /// type is skipped.
    /// </remarks>
    class XmlZip {
        // Reserved marker values; dictionary indexes therefore range over 1..65534.
        const ushort TextMarker = 0;
        const ushort EndElementMarker = ushort.MaxValue;

        // index -> entry (used when decoding)
        Dictionary<ushort, XmlNodeItem> _map = new Dictionary<ushort, XmlNodeItem>();
        // path -> index (used when encoding)
        Dictionary<string, ushort> _map2 = new Dictionary<string, ushort>();

        /// <summary>
        /// Builds the name dictionary from a sample document. Any dictionary from
        /// a previous call is replaced; if parsing fails the previous dictionary
        /// is left untouched.
        /// </summary>
        /// <param name="xmlInput">Sample XML containing every element and attribute path that will be encoded.</param>
        /// <exception cref="InvalidOperationException">The sample needs more than 65534 indexes.</exception>
        public void Init(string xmlInput) {
            Dictionary<ushort, XmlNodeItem> byIndex = new Dictionary<ushort, XmlNodeItem>();
            Dictionary<string, ushort> byPath = new Dictionary<string, ushort>();
            // A local path keeps a failed parse from leaking state into later calls.
            MyXpath path = new MyXpath();
            // Indexes are handed out in document order, one per element/attribute
            // occurrence, exactly as before, so the wire format is unchanged.
            ushort next = 1;

            using (StringReader sr = new StringReader(xmlInput))
            using (XmlReader reader = XmlReader.Create(sr)) {
                while (reader.Read()) {
                    switch (reader.NodeType) {
                        case XmlNodeType.Element:
                            path.AddElement(reader.Name);
                            next = Register(byIndex, byPath, next, path, reader.Name, ItemType.Element);
                            if (reader.MoveToFirstAttribute()) {
                                do {
                                    path.AddAttribute(reader.Name);
                                    next = Register(byIndex, byPath, next, path, reader.Name, ItemType.Attritube);
                                    path.RemoveLastElement();
                                } while (reader.MoveToNextAttribute());
                                reader.MoveToElement();
                            }
                            // <x/> produces no EndElement node, so pop here.
                            if (reader.IsEmptyElement) {
                                path.RemoveLastElement();
                            }
                            break;
                        case XmlNodeType.EndElement:
                            path.RemoveLastElement();
                            break;
                        default:
                            break;
                    }
                }
            }

            _map = byIndex;
            _map2 = byPath;
        }

        /// <summary>
        /// Encodes a document using the dictionary built by <see cref="Init"/>.
        /// </summary>
        /// <param name="xmlInput">XML text to encode.</param>
        /// <returns>The encoded bytes.</returns>
        /// <exception cref="InvalidOperationException">
        /// An element or attribute path is not in the dictionary, or a text or
        /// attribute value is longer than 65535 UTF-8 bytes.
        /// </exception>
        public byte[] XmlToBytes(string xmlInput) {
            MyXpath path = new MyXpath();

            using (MemoryStream ms = new MemoryStream())
            using (BinaryWriter bw = new BinaryWriter(ms))
            using (StringReader sr = new StringReader(xmlInput))
            using (XmlReader reader = XmlReader.Create(sr)) {
                while (reader.Read()) {
                    switch (reader.NodeType) {
                        case XmlNodeType.Element:
                            path.AddElement(reader.Name);
                            // An unknown element cannot be skipped safely: its end
                            // marker would close the wrong element on decode.
                            bw.Write(Lookup(path));
                            if (reader.MoveToFirstAttribute()) {
                                do {
                                    path.AddAttribute(reader.Name);
                                    ushort attributeIndex = Lookup(path);
                                    path.RemoveLastElement();
                                    bw.Write(attributeIndex);
                                    WriteValue(bw, reader.Value);
                                } while (reader.MoveToNextAttribute());
                                reader.MoveToElement();
                            }
                            if (reader.IsEmptyElement) {
                                path.RemoveLastElement();
                                bw.Write(EndElementMarker);
                            }
                            break;
                        case XmlNodeType.EndElement:
                            path.RemoveLastElement();
                            bw.Write(EndElementMarker);
                            break;
                        case XmlNodeType.Text:
                            bw.Write(TextMarker);
                            WriteValue(bw, reader.Value);
                            break;
                        default:
                            break;
                    }
                }
                bw.Flush();
                return ms.ToArray();
            }
        }

        /// <summary>
        /// Decodes bytes produced by <see cref="XmlToBytes"/> back into indented XML text.
        /// </summary>
        /// <param name="bytes">Encoded bytes.</param>
        /// <returns>The reconstructed XML.</returns>
        /// <exception cref="InvalidDataException">The stream contains an index that is not in the dictionary.</exception>
        /// <exception cref="EndOfStreamException">The stream ends in the middle of a record.</exception>
        public string BytesToXml(byte[] bytes) {
            StringBuilder sb = new StringBuilder();
            XmlWriterSettings settings = new XmlWriterSettings();
            settings.Indent = true;

            using (MemoryStream ms = new MemoryStream(bytes))
            using (BinaryReader br = new BinaryReader(ms))
            using (StringWriter sw = new StringWriter(sb))
            using (XmlWriter writer = XmlWriter.Create(sw, settings)) {
                // Compare positions instead of calling PeekChar: PeekChar decodes the
                // next bytes as text, which is wrong for (and can throw on) binary data.
                while (ms.Position < ms.Length) {
                    ushort readFlag = br.ReadUInt16();
                    XmlNodeItem item;
                    if (readFlag == TextMarker) {
                        writer.WriteString(ReadValue(br));
                    }
                    else if (readFlag == EndElementMarker) {
                        writer.WriteEndElement();
                    }
                    else if (_map.TryGetValue(readFlag, out item)) {
                        if (item.ItemType == ItemType.Element) {
                            writer.WriteStartElement(item.Text);
                        }
                        else if (item.ItemType == ItemType.Attritube) {
                            writer.WriteAttributeString(item.Text, ReadValue(br));
                        }
                    }
                    else {
                        // Skipping an unknown index would desynchronise the reader.
                        throw new InvalidDataException(
                            string.Format("Index {0} is not in the dictionary.", readFlag));
                    }
                }
                writer.Flush();
            }
            return sb.ToString();
        }

        // Adds one dictionary entry under the given index and returns the next free index.
        static ushort Register(Dictionary<ushort, XmlNodeItem> byIndex, Dictionary<string, ushort> byPath,
                               ushort index, MyXpath path, string name, ItemType type) {
            if (index == EndElementMarker) {
                throw new InvalidOperationException(
                    "The sample document needs more dictionary indexes than fit in 16 bits.");
            }
            string xpath = path.ToString();
            byIndex[index] = new XmlNodeItem() {
                Xpath = xpath,
                Text = name,
                ItemType = type
            };
            // A path that occurs several times maps to the index of its last occurrence.
            byPath[xpath] = index;
            return (ushort)(index + 1);
        }

        // Returns the dictionary index for the current path, or throws if it is unknown.
        ushort Lookup(MyXpath path) {
            string key = path.ToString();
            ushort index;
            if (!_map2.TryGetValue(key, out index)) {
                throw new InvalidOperationException(
                    string.Format("'{0}' is not in the dictionary; call Init with a sample that contains it.", key));
            }
            return index;
        }

        // Writes a value as a ushort byte count followed by its UTF-8 bytes.
        static void WriteValue(BinaryWriter bw, string value) {
            byte[] bs = Encoding.UTF8.GetBytes(value);
            if (bs.Length > ushort.MaxValue) {
                // A plain cast would truncate the length and corrupt the stream.
                throw new InvalidOperationException(
                    string.Format("Value of {0} bytes exceeds the 65535-byte limit.", bs.Length));
            }
            bw.Write((ushort)bs.Length);
            bw.Write(bs);
        }

        // Reads a value written by WriteValue.
        static string ReadValue(BinaryReader br) {
            int len = br.ReadUInt16();
            byte[] bs = br.ReadBytes(len);
            if (bs.Length != len) {
                throw new EndOfStreamException("The encoded data ends in the middle of a value.");
            }
            return Encoding.UTF8.GetString(bs);
        }
    }

    参考链接

    XML压缩和传输性能的改善

    http://blog.csdn.net/BruceWayen/archive/2006/03/13/623483.aspx

    XQzip XML压缩技术(1)--介绍

    http://qiyanfeng.blog.51cto.com/503144/105203

    XQzip:可查询XML压缩算法分析(1)

    http://qiyanfeng.blog.51cto.com/503144/105578

    WAP Binary XML Content Format

    http://www.w3.org/TR/wbxml/

  • 相关阅读:
    BNU 51002 BQG's Complexity Analysis
    BNU OJ 51003 BQG's Confusing Sequence
    BNU OJ 51000 BQG's Random String
    BNU OJ 50999 BQG's Approaching Deadline
    BNU OJ 50998 BQG's Messy Code
    BNU OJ 50997 BQG's Programming Contest
    CodeForces 609D Gadgets for dollars and pounds
    CodeForces 609C Load Balancing
    CodeForces 609B The Best Gift
    CodeForces 609A USB Flash Drives
  • 原文地址:https://www.cnblogs.com/onlytiancai/p/XmlCompression.html
Copyright © 2011-2022 走看看