zoukankan      html  css  js  c++  java
  • 获取网页的Encoding

    下载Html页面时,WebClient返回的是字节数组;要把它正确地解码成String,就必须先得到页面的Encoding。得到Encoding的方法很简单,在这里写下自己用到的code,做一下笔记。

    代码其实都很简单,里面还有些简单的注释,很容易理解!

    得到Encoding部分的代码:

    View Code
            /// <summary>
    /// Works out the text encoding of a downloaded HTML page from its
    /// Content-Type header value and its raw bytes.
    /// </summary>
    /// <param name="contentType">The Content-Type value obtained from the WebClient response.</param>
    /// <param name="myData">The raw page bytes downloaded by the WebClient.</param>
    /// <returns>
    /// The first encoding found by, in order: the charset declaration inside the page text,
    /// the charset in <paramref name="contentType"/>, and byte-level detection.
    /// UTF-8 is returned when none of them yields a result or when any exception is thrown.
    /// </returns>
    public static Encoding GetHtmlEncoding(string contentType, byte[] myData)
    {
        try
        {
            // Decode the bytes as ASCII first; the charset declaration is searched
            // for in this string.
            string asciiText = Encoding.ASCII.GetString(myData);

            // Each step runs only if the previous one returned null
            // (?? evaluates its right-hand side lazily).
            return GetHtmlEncodingFormString(asciiText)      // step 1: charset declared in the page
                ?? GetEncodingFromContentType(contentType)   // step 2: charset in the Content-Type value
                ?? GetEncodingFromBytes(myData)              // step 3: detection from the raw bytes
                ?? Encoding.UTF8;                            // nothing found: default
        }
        catch
        {
            // Any failure falls back to the UTF-8 default.
            return Encoding.UTF8;
        }
    }

    // Matches "charset = value" where the value may be unquoted, double-quoted or
    // single-quoted. Only characters that can appear in an encoding name are captured,
    // so the match stops at the closing quote, '>' or whitespace.
    // Built once and reused instead of being recompiled on every call.
    private static readonly Regex CharsetDeclarationRegex = new Regex(
        @"charset\b\s*=\s*[""']?\s*(?<charset>[A-Za-z0-9_.:+\-]+)",
        RegexOptions.IgnoreCase | RegexOptions.Compiled);

    /// <summary>
    /// Step 1: finds the first charset declaration in the page text and resolves it
    /// to an <see cref="Encoding"/>.
    /// </summary>
    /// <param name="htmlContent">The page content decoded as an ASCII string.</param>
    /// <returns>
    /// The declared encoding, or <c>null</c> when the text is null/empty, contains no
    /// charset declaration, or names an encoding that cannot be resolved.
    /// </returns>
    private static Encoding GetHtmlEncodingFormString(string htmlContent)
    {
        if (string.IsNullOrEmpty(htmlContent))
        {
            return null;
        }

        // Run the regex once and inspect the result (no IsMatch + Match double scan).
        Match declaration = CharsetDeclarationRegex.Match(htmlContent);
        if (!declaration.Success)
        {
            return null;
        }

        string encodingName = declaration.Groups["charset"].Value;
        try
        {
            return Encoding.GetEncoding(encodingName);
        }
        catch (ArgumentException)
        {
            // Unknown or unsupported encoding name: report "not found" so the
            // caller can move on to its next detection step.
            return null;
        }
    }

    /// <summary>
    /// Step 2: extracts the <c>charset</c> parameter from a Content-Type value
    /// (for example <c>text/html; charset=utf-8</c>) and resolves it to an
    /// <see cref="Encoding"/>.
    /// </summary>
    /// <param name="contentType">The Content-Type value; may be null or empty.</param>
    /// <returns>
    /// The encoding named by the charset parameter, or <c>null</c> when the value is
    /// null/empty, has no charset parameter, or names an encoding that cannot be resolved.
    /// </returns>
    private static Encoding GetEncodingFromContentType(string contentType)
    {
        if (string.IsNullOrEmpty(contentType))
        {
            return null;
        }

        // Dropping empty entries lets "charset = utf-8" (spaces around '=') work:
        // otherwise the token after "charset" would be an empty string.
        string[] tokens = contentType.Split(
            new char[] { ';', '=', ' ', '\t' },
            StringSplitOptions.RemoveEmptyEntries);

        for (int i = 0; i < tokens.Length - 1; i++)
        {
            if (!string.Equals(tokens[i], "charset", StringComparison.OrdinalIgnoreCase))
            {
                continue;
            }

            // The parameter value may be quoted, e.g. charset="utf-8".
            string encodingName = tokens[i + 1].Trim('"', '\'');
            if (encodingName.Length == 0)
            {
                return null;
            }

            try
            {
                return Encoding.GetEncoding(encodingName);
            }
            catch (ArgumentException)
            {
                // Unknown or unsupported encoding name: report "not found".
                return null;
            }
        }

        return null;
    }

    /// <summary>
    /// Step 3: feeds the raw bytes to a <c>UniversalDetector</c> and resolves the
    /// charset name it reports to an <see cref="Encoding"/>.
    /// </summary>
    /// <param name="myData">The raw page bytes.</param>
    /// <returns>
    /// The detected encoding, or <c>null</c> when the detector reports no charset
    /// or any exception is thrown along the way.
    /// </returns>
    private static Encoding GetEncodingFromBytes(byte[] myData)
    {
        Encoding detected = null;

        try
        {
            UniversalDetector detector = new UniversalDetector(null);
            detector.HandleData(myData, 0, myData.Length);
            detector.DataEnd();

            string charsetName = detector.GetDetectedCharset();
            if (!string.IsNullOrEmpty(charsetName))
            {
                detected = Encoding.GetEncoding(charsetName);
            }
        }
        catch
        {
            // Best effort: any failure leaves the result as null.
        }

        return detected;
    }

    测试的代码:

    View Code
    /// <summary>
    /// Demo: downloads a page with a WebClient, determines its encoding with
    /// GetHtmlEncoding, decodes the bytes and writes the HTML to the console.
    /// </summary>
    public static void TestEncoding()
    {
        // WebClient is IDisposable; the using block releases it even if the download throws.
        using (WebClient client = new WebClient())
        {
            // Download the page as raw bytes.
            byte[] bytes = client.DownloadData("http://www.baidu.com");

            // Read the Content-Type value from the response headers.
            string contentType = client.ResponseHeaders["Content-Type"];

            // Determine the page encoding from the header value and the bytes.
            Encoding encoding = GetHtmlEncoding(contentType, bytes);

            // Decode the bytes into the HTML string.
            string htmlString = encoding.GetString(bytes);
            Console.WriteLine(htmlString);
        }
    }

    需要用到的第三方类库为:NUniversalCharDet

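    引用:using Mozilla.NUniversalCharDet;(此外代码还用到 using System; using System.Globalization; using System.Net; using System.Text; using System.Text.RegularExpressions;)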

    这个dll从网上download一个就可以了!


  • 相关阅读:
    CS027th: 6papers
    MATH026th: 《矩斋筹算丛刻》
    MATH026th: 《古今算学丛书》目录
    Compiler25th005: Excel Compiler
    AIIE25th004: 2020aiie在合肥举办
    AIIE21th003: 2021年第二届国际工业工程和人工智能大会(IEAI 2021)
    ComPiler200004:Library-Oriented Programming
    ComPiler200003:Story-Oriented Programming
    ComPiler200002:Growing a Compiler
    conda
  • 原文地址:https://www.cnblogs.com/pmars/p/2302272.html
Copyright © 2011-2022 走看看