zoukankan      html  css  js  c++  java
  • 网页字数统计

    下载

    代码

    using System;
    using System.ComponentModel;
    using System.Globalization;
    using System.Text;
    using System.Threading;
    using System.Windows.Forms;
    
    
    namespace WillCC
    {
        public partial class Form1 : Form
        {
            // Synchronization context captured in the Form1 constructor. Website instances
            // post their property-change notifications through it, and btnGo_Click posts
            // row additions through it.
            // NOTE(review): presumably the WinForms UI-thread context — confirm.
            private static SynchronizationContext _synchronizationContext;
            /// <summary>
            /// Stores the synchronization context that is current on the constructing thread
            /// in the shared static field, then builds the form's controls.
            /// </summary>
            public Form1()
            {
                _synchronizationContext = SynchronizationContext.Current;
                InitializeComponent();
            }
    
            /// <summary>
            /// Starts a new run: binds a fresh result list to the grid, adds a row for the URL
            /// typed into the text box, and adds one further row for every link that the
            /// top-level page reports.
            /// </summary>
            /// <param name="sender">Event source (unused).</param>
            /// <param name="e">Event data (unused).</param>
            private void btnGo_Click(object sender, EventArgs e)
            {
                // A fresh list per click, so the grid only shows results of the current run.
                var rows = new BindingList<Website>();
                dgResult.DataSource = rows;

                var rootSite = new Website(txtUrl.Text);
                rows.Add(rootSite);

                // Each reported link becomes its own Website row; the row is added through
                // the captured synchronization context rather than directly.
                rootSite.LinkParsed += delegate(string link)
                {
                    _synchronizationContext.Post(
                        delegate { rows.Add(new Website(link)); },
                        null);
                };
            }
    
            public class Website : INotifyPropertyChanged
            {
                /// <summary>
                /// Raised with the href value of each anchor found once this page has loaded.
                /// </summary>
                public event Action<string> LinkParsed;

                // Backing store for the 网址 (URL) property.
                private string _url;

                /// <summary>
                /// The address this row was created for. Setting it raises a change
                /// notification for "网址".
                /// </summary>
                public string 网址
                {
                    get
                    {
                        return _url;
                    }
                    private set
                    {
                        _url = value;
                        RefreshProperty("网址");
                    }
                }

                // Backing store for the 统计 (statistics) property.
                private string _status;

                /// <summary>
                /// Progress or result text for this row: empty at first, a progress message
                /// while navigating, then the character count or the text of an exception.
                /// Setting it raises a change notification for "统计".
                /// </summary>
                public string 统计
                {
                    get
                    {
                        return _status;
                    }
                    private set
                    {
                        _status = value;
                        RefreshProperty("统计");
                    }
                }
    
                /// <summary>
                /// Creates a row for <paramref name="url"/> and immediately starts loading the
                /// page in an off-screen <see cref="WebBrowser"/>. When the document completes,
                /// the page text is stripped of markup and spaces, its character count is
                /// published through the 统计 property, and <see cref="LinkParsed"/> is raised
                /// for every anchor that carries a non-empty href.
                /// </summary>
                /// <param name="url">Address of the page to load and count.</param>
                public Website(string url)
                {
                    网址 = url;
                    统计 = "";

                    var browser = new WebBrowser { ScriptErrorsSuppressed = true, AllowNavigation = false, AllowWebBrowserDrop = false, IsWebBrowserContextMenuEnabled = false, ScrollBarsEnabled = false, WebBrowserShortcutsEnabled = false, };

                    // NOTE(review): the work is started with Delegate.BeginInvoke and EndInvoke
                    // is never called. This pattern is reportedly unsupported on .NET Core /
                    // .NET 5+ — confirm before retargeting the project.
                    new Action(() =>
                        {
                            browser.Navigating += (ss, ee) => 统计 = "抓取网页中...";
                            browser.DocumentCompleted += (ss, ee) =>
                                {
                                    try
                                    {
                                        var text = StripHtml(browser.DocumentText);
                                        var count = Count(text.Replace(" ", ""));
                                        统计 = count.ToString(CultureInfo.InvariantCulture);

                                        // Snapshot the delegate so the null check and the
                                        // invocations below see the same subscriber list.
                                        var handler = LinkParsed;
                                        if (handler != null)
                                        {
                                            var document = browser.Document;
                                            if (document != null)
                                            {
                                                foreach (object item in document.GetElementsByTagName("A"))
                                                {
                                                    var anchor = item as HtmlElement;
                                                    if (anchor == null)
                                                    {
                                                        continue;
                                                    }

                                                    // Anchors without an href (e.g. named anchors)
                                                    // yield an empty value; reporting them would
                                                    // only produce rows with nothing to load.
                                                    var link = anchor.GetAttribute("href");
                                                    if (!string.IsNullOrEmpty(link))
                                                    {
                                                        handler(link);
                                                    }
                                                }
                                            }
                                        }
                                    }
                                    finally
                                    {
                                        // Release the browser even when counting or a
                                        // subscriber throws; previously it leaked in that case.
                                        browser.Dispose();
                                    }
                                };
                            try
                            {
                                browser.Navigate(url);
                            }
                            catch (Exception ex)
                            {
                                统计 = ex.ToString();

                                // Navigation never started, so DocumentCompleted will not run
                                // and dispose the browser. Dispose it here, via the captured
                                // context (presumably the thread that created the control).
                                _synchronizationContext.Post(d => browser.Dispose(), null);
                            }
                        }).BeginInvoke(null, null);
                }
    
                /// <summary>
                /// Queues a <see cref="PropertyChanged"/> notification for
                /// <paramref name="propertyName"/> on the captured synchronization context.
                /// Nothing is raised if there are no subscribers when the callback runs.
                /// </summary>
                /// <param name="propertyName">Name of the property whose value changed.</param>
                private void RefreshProperty(string propertyName)
                {
                    SendOrPostCallback notify = delegate
                    {
                        if (PropertyChanged == null)
                        {
                            return;
                        }

                        PropertyChanged(this, new PropertyChangedEventArgs(propertyName));
                    };

                    _synchronizationContext.Post(notify, null);
                }
    
                /// <summary>
                /// Converts an HTML document into approximate plain text: drops the head,
                /// script and style elements, turns structural tags into tabs / line breaks,
                /// removes every remaining tag, translates a handful of common character
                /// entities and finally collapses redundant breaks and tabs.
                /// </summary>
                /// <param name="source">Raw HTML; null or empty yields an empty string.</param>
                /// <returns>
                /// The visible text, using "\r" as line break and "\t" as cell separator.
                /// </returns>
                private static string StripHtml(string source)
                {
                    if (string.IsNullOrEmpty(source))
                    {
                        return string.Empty;
                    }

                    // Line breaks in the markup are rendered as a space by browsers, tabs are
                    // pure source formatting, and repeated spaces collapse into one.
                    string result = source.Replace("\r", " ").Replace("\n", " ").Replace("\t", string.Empty);
                    result = ReplaceIgnoreCase(result, @"( )+", " ");

                    // Remove elements whose content is never displayed. The match inside
                    // RemoveElement is lazy, so text between two script/style blocks survives.
                    result = RemoveElement(result, "head");
                    result = RemoveElement(result, "script");
                    result = RemoveElement(result, "style");

                    // Table cells become tabs.
                    result = ReplaceIgnoreCase(result, @"<( )*td([^>])*>", "\t");

                    // <br>, <br/>, <br /> and <li ...> become single line breaks.
                    result = ReplaceIgnoreCase(result, @"<( )*br( )*(/)?( )*>", "\r");
                    result = ReplaceIgnoreCase(result, @"<( )*li( [^>]*)?>", "\r");

                    // <div>, <tr> and <p> become paragraph breaks (two line breaks).
                    result = ReplaceIgnoreCase(result, @"<( )*div([^>])*>", "\r\r");
                    result = ReplaceIgnoreCase(result, @"<( )*tr([^>])*>", "\r\r");
                    result = ReplaceIgnoreCase(result, @"<( )*p([^>])*>", "\r\r");

                    // Everything else enclosed in < > (links, images, comments, ...) goes away.
                    result = ReplaceIgnoreCase(result, @"<[^>]*>", string.Empty);

                    // Translate common character entities. This runs after tag removal so a
                    // decoded "<" or ">" is not mistaken for markup.
                    result = ReplaceIgnoreCase(result, @"&nbsp;", " ");
                    result = ReplaceIgnoreCase(result, @"&bull;", " * ");
                    result = ReplaceIgnoreCase(result, @"&lsaquo;", "<");
                    result = ReplaceIgnoreCase(result, @"&rsaquo;", ">");
                    result = ReplaceIgnoreCase(result, @"&trade;", "(tm)");
                    result = ReplaceIgnoreCase(result, @"&frasl;", "/");
                    result = ReplaceIgnoreCase(result, @"&lt;", "<");
                    result = ReplaceIgnoreCase(result, @"&gt;", ">");
                    result = ReplaceIgnoreCase(result, @"&copy;", "(c)");
                    result = ReplaceIgnoreCase(result, @"&reg;", "(r)");

                    // Drop any other entity. Only entity-shaped tokens (&name; / &#123; /
                    // &#x1F;) are matched, so ordinary text such as "Tom & Jerry;" is kept.
                    result = ReplaceIgnoreCase(result, @"&#?[a-z0-9]{2,8};", string.Empty);

                    // Make line breaking consistent.
                    result = result.Replace("\n", "\r");

                    // Remove spaces caught between breaks/tabs, and tabs caught between breaks.
                    result = ReplaceIgnoreCase(result, "(\r)( )+(\r)", "\r\r");
                    result = ReplaceIgnoreCase(result, "(\t)( )+(\t)", "\t\t");
                    result = ReplaceIgnoreCase(result, "(\t)( )+(\r)", "\t\r");
                    result = ReplaceIgnoreCase(result, "(\r)( )+(\t)", "\r\t");
                    result = ReplaceIgnoreCase(result, "(\r)(\t)+(\r)", "\r\r");
                    // Several tabs right after a line break shrink to a single tab.
                    result = ReplaceIgnoreCase(result, "(\r)(\t)+", "\r\t");

                    // Cap runs: more than two breaks become two, more than four tabs become
                    // four. One regex pass each, instead of a loop over the whole text length.
                    result = ReplaceIgnoreCase(result, "\r{3,}", "\r\r");
                    result = ReplaceIgnoreCase(result, "\t{5,}", "\t\t\t\t");

                    return result;
                }

                // Removes every <tagName ...>...</tagName> element, content included, after
                // first normalising the opening and closing tags to their bare form.
                private static string RemoveElement(string html, string tagName)
                {
                    html = ReplaceIgnoreCase(html, @"<( )*" + tagName + @"([^>])*>", "<" + tagName + ">");
                    html = ReplaceIgnoreCase(html, @"(<( )*(/)( )*" + tagName + @"( )*>)", "</" + tagName + ">");
                    return ReplaceIgnoreCase(html, "(<" + tagName + ">).*?(</" + tagName + ">)", string.Empty);
                }

                // Case-insensitive Regex.Replace; keeps the pipeline above readable.
                private static string ReplaceIgnoreCase(string input, string pattern, string replacement)
                {
                    return System.Text.RegularExpressions.Regex.Replace(
                        input,
                        pattern,
                        replacement,
                        System.Text.RegularExpressions.RegexOptions.IgnoreCase);
                }
    
                /// <summary>
                /// Returns the number of UTF-16 characters in <paramref name="text"/>.
                /// </summary>
                /// <param name="text">Text to measure; null is counted as 0.</param>
                /// <returns>The character count.</returns>
                private static int Count(string text)
                {
                    // A .NET string is already UTF-16, so its Length is the value the former
                    // encode-to-bytes / count-chars round trip produced, without the
                    // temporary byte array.
                    return string.IsNullOrEmpty(text) ? 0 : text.Length;
                }
    
                /// <summary>
                /// Raised (via RefreshProperty) after the 网址 or 统计 property has been assigned.
                /// </summary>
                public event PropertyChangedEventHandler PropertyChanged;
            }
        }
    }
    



    截图


  • 相关阅读:
    剑指offer_11:二进制中1的个数
    剑指offer_10:矩形覆盖
    spring mvc 访问静态资源
    spring context:component-scan ex
    spring aop配置未生效
    415 Unsupported Media Type
    spring mvc 接收List对象入参
    JIRA甘特图
    JIRA的工时
    JIRA导出工作日志到Excel
  • 原文地址:https://www.cnblogs.com/beta2013/p/3377272.html
Copyright © 2011-2022 走看看