zoukankan      html  css  js  c++  java
  • 无法判断目标网站编码的解决方法

    /// <summary>
            /// Downloads the resource at <paramref name="url"/> and decodes it to text.
            /// The encoding is taken first from the charset of the HTTP response; if the
            /// page body contains a <c>&lt;meta ... charset=...&gt;</c> declaration naming a
            /// different charset, the raw bytes are decoded again with the declared one.
            /// </summary>
            /// <param name="url">Absolute HTTP/HTTPS URL of the page to download.</param>
            /// <param name="encode">
            /// On return, the encoding that was finally used to decode the page.
            /// The incoming value is not read.
            /// </param>
            /// <returns>The decoded page source.</returns>
            /// <exception cref="WebException">The request failed or timed out (10 s).</exception>
            public static string GetDataFromUrl(string url, ref Encoding encode)
            {
                HttpWebRequest request = (HttpWebRequest)HttpWebRequest.Create(url);

                // HTTP request settings.
                request.AllowAutoRedirect = true;
                request.AllowWriteStreamBuffering = true;
                request.Referer = "";
                request.Timeout = 10 * 1000;
                request.UserAgent = "";

                string characterSet;
                byte[] pageBytes;

                // 'using' guarantees the response and its streams are released even
                // when reading or charset resolution throws.
                using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
                {
                    // First guess: the charset announced in the HTTP headers.
                    characterSet = response.CharacterSet;
                    if (!string.IsNullOrEmpty(characterSet))
                    {
                        // ISO-8859-1 is treated as "not really specified" and replaced
                        // by gb2312, as in the original implementation.
                        if (string.Equals(characterSet, "ISO-8859-1", StringComparison.OrdinalIgnoreCase))
                        {
                            characterSet = "gb2312";
                        }

                        // An unknown/unsupported charset name must not abort the download.
                        encode = ResolveEncodingOrNull(characterSet) ?? Encoding.Default;
                    }
                    else
                    {
                        encode = Encoding.Default;
                    }

                    // Keep the raw bytes so the page can be decoded a second time
                    // if the document itself declares a different charset.
                    using (Stream receiveStream = response.GetResponseStream())
                    using (MemoryStream collected = new MemoryStream())
                    {
                        byte[] chunk = new byte[4096];
                        int count = receiveStream.Read(chunk, 0, chunk.Length);
                        while (count > 0)
                        {
                            collected.Write(chunk, 0, count);
                            count = receiveStream.Read(chunk, 0, chunk.Length);
                        }

                        pageBytes = collected.ToArray();
                    }
                }

                string str = DecodePageBytes(pageBytes, encode);

                // If the page declares a charset that differs from the HTTP one,
                // the page declaration wins and the bytes are decoded again.
                Match metaMatch = MetaCharsetPattern.Match(str);
                if (metaMatch.Success)
                {
                    string declaredCharSet = metaMatch.Groups[1].Value;
                    if (!string.Equals(declaredCharSet, characterSet, StringComparison.OrdinalIgnoreCase))
                    {
                        Encoding declaredEncoding = ResolveEncodingOrNull(declaredCharSet);
                        if (declaredEncoding != null)
                        {
                            encode = declaredEncoding;
                            str = DecodePageBytes(pageBytes, encode);
                        }
                    }
                }

                return str;
            }

            /// <summary>
            /// Matches the charset name inside a meta tag, covering both
            /// <c>&lt;meta charset="utf-8"&gt;</c> and
            /// <c>&lt;meta http-equiv="Content-Type" content="text/html; charset=utf-8"&gt;</c>.
            /// Group 1 is the bare charset name (no quotes).
            /// </summary>
            private static readonly Regex MetaCharsetPattern =
                new Regex(@"<meta[^>]*?charset\s*=\s*[""']?\s*([\w.:\-]+)",
                          RegexOptions.IgnoreCase);

            /// <summary>
            /// Looks up an encoding by charset name; returns null instead of throwing
            /// when the name is empty or not known to the runtime.
            /// </summary>
            private static Encoding ResolveEncodingOrNull(string charsetName)
            {
                if (string.IsNullOrEmpty(charsetName))
                {
                    return null;
                }

                try
                {
                    return Encoding.GetEncoding(charsetName.Trim());
                }
                catch (ArgumentException)
                {
                    return null;
                }
            }

            /// <summary>
            /// Decodes the downloaded bytes with the given encoding. A StreamReader is
            /// used (rather than Encoding.GetString) so that a byte-order mark at the
            /// start of the data is still honoured, as in the original implementation.
            /// </summary>
            private static string DecodePageBytes(byte[] pageBytes, Encoding encoding)
            {
                using (MemoryStream source = new MemoryStream(pageBytes, false))
                using (StreamReader reader = new StreamReader(source, encoding))
                {
                    return reader.ReadToEnd();
                }
            }
  • 相关阅读:
    cannot import name 'PILLOW_VERSION'
    scala spark2.0 rdd dataframe 分布式计算欧式距离
    scala spark dataframe 修改字段类型
    获取cookie脚本
    Loadrunner 获取请求的返回结果函数web_reg_save_param
    Python模拟接口登录
    web自动化上传附件 2
    Web自动化附件上传
    robotframework 连接mysql数据库
    Json格式获取接口返回的值
  • 原文地址:https://www.cnblogs.com/muxueyuan/p/4522292.html
Copyright © 2011-2022 走看看