zoukankan      html  css  js  c++  java
  • 网页字数统计

    下载

    代码

    using System;
    using System.ComponentModel;
    using System.Globalization;
    using System.Text;
    using System.Threading;
    using System.Windows.Forms;
    
    
    namespace WillCC
    {
        public partial class Form1 : Form
        {
            private static SynchronizationContext _synchronizationContext;
            /// <summary>
            /// Creates the form and captures the synchronization context of the constructing
            /// thread so that <see cref="Website"/> instances can post work back to it.
            /// </summary>
            public Form1()
            {
                // SynchronizationContext.Current may be null (for example when automatic
                // installation of the Windows Forms context has been switched off). Every
                // later Post() dereferences this field, so fall back to an explicit
                // Windows Forms context instead of failing later with a NullReferenceException.
                _synchronizationContext = SynchronizationContext.Current
                                          ?? new WindowsFormsSynchronizationContext();
                InitializeComponent();
            }
    
            /// <summary>
            /// Starts a new count for the address typed into <c>txtUrl</c>: binds a fresh result
            /// list to <c>dgResult</c>, adds an entry for the typed page and then adds one more
            /// entry for every link that page reports through <see cref="Website.LinkParsed"/>.
            /// </summary>
            /// <param name="sender">The control that raised the click (unused).</param>
            /// <param name="e">Click event data (unused).</param>
            private void btnGo_Click(object sender, EventArgs e)
            {
                // Nothing to fetch when the address box is blank.
                var address = (txtUrl.Text ?? string.Empty).Trim();
                if (address.Length == 0)
                {
                    return;
                }

                var websites = new BindingList<Website>();
                dgResult.DataSource = websites;

                var topWebsite = new Website(address);
                websites.Add(topWebsite);

                // The list is bound to the grid, so it is only modified through the captured
                // synchronization context. Entries created here for linked pages get no
                // LinkParsed subscriber, so only the links of the top page are followed.
                topWebsite.LinkParsed += url =>
                    _synchronizationContext.Post(d => websites.Add(new Website(url)), null);
            }
    
            public class Website : INotifyPropertyChanged
            {
                public event Action<string> LinkParsed;
                private string _网址;
                private string _统计;
    
                /// <summary>
                /// Address (URL) of the page this entry represents. Assigning it stores the
                /// value and then requests a change notification for the "网址" property.
                /// </summary>
                public string 网址
                {
                    get
                    {
                        return _网址;
                    }

                    private set
                    {
                        _网址 = value;
                        RefreshProperty("网址");
                    }
                }
    
                /// <summary>
                /// Status or result text for this entry (progress message, character count or
                /// error text). Assigning it stores the value and then requests a change
                /// notification for the "统计" property.
                /// </summary>
                public string 统计
                {
                    get
                    {
                        return _统计;
                    }

                    private set
                    {
                        _统计 = value;
                        RefreshProperty("统计");
                    }
                }
    
                /// <summary>
                /// Creates an entry for <paramref name="url"/> and starts loading the page in a
                /// hidden <see cref="WebBrowser"/>. When the document has loaded, its markup is
                /// stripped to plain text, the characters (spaces excluded) are counted into
                /// <see cref="统计"/>, and <see cref="LinkParsed"/> is raised once for every
                /// anchor that carries a non-empty <c>href</c>. If starting the navigation fails,
                /// the exception text is stored in <see cref="统计"/> instead.
                /// </summary>
                /// <param name="url">Address of the page to load and count.</param>
                public Website(string url)
                {
                    网址 = url;
                    统计 = "";

                    var browser = new WebBrowser
                    {
                        ScriptErrorsSuppressed = true,
                        AllowNavigation = false,
                        AllowWebBrowserDrop = false,
                        IsWebBrowserContextMenuEnabled = false,
                        ScrollBarsEnabled = false,
                        WebBrowserShortcutsEnabled = false,
                    };

                    // NOTE(review): Delegate.BeginInvoke is kept to preserve the original
                    // threading behaviour; it is only supported on .NET Framework, so confirm
                    // the target runtime before porting this code.
                    new Action(() =>
                        {
                            browser.Navigating += (ss, ee) => 统计 = "抓取网页中...";
                            browser.DocumentCompleted += (ss, ee) =>
                                {
                                    try
                                    {
                                        var text = StripHtml(browser.DocumentText);
                                        var count = Count(text.Replace(" ", ""));
                                        统计 = count.ToString(CultureInfo.InvariantCulture);

                                        // Snapshot the event so a subscriber removed while the
                                        // links are being reported cannot leave a null delegate.
                                        var linkParsed = LinkParsed;
                                        var document = browser.Document;
                                        if (linkParsed != null && document != null)
                                        {
                                            foreach (var item in document.GetElementsByTagName("A"))
                                            {
                                                var anchor = item as HtmlElement;
                                                if (anchor == null)
                                                {
                                                    continue;
                                                }

                                                // Anchors without an href (named anchors) are
                                                // not links, so they are not reported.
                                                var link = anchor.GetAttribute("href");
                                                if (!string.IsNullOrEmpty(link))
                                                {
                                                    linkParsed(link);
                                                }
                                            }
                                        }
                                    }
                                    finally
                                    {
                                        // Release the browser even when counting or a
                                        // subscriber throws.
                                        browser.Dispose();
                                    }
                                };
                            try
                            {
                                browser.Navigate(url);
                            }
                            catch (Exception ex)
                            {
                                统计 = ex.ToString();
                                // No DocumentCompleted will follow a failed Navigate, so the
                                // browser has to be released here.
                                browser.Dispose();
                            }
                        }).BeginInvoke(null, null);
                }
    
                /// <summary>
                /// Raises <see cref="PropertyChanged"/> for <paramref name="propertyName"/>.
                /// The notification is posted to the synchronization context captured by the
                /// <c>Form1</c> constructor; if no context has been captured yet, it is raised
                /// directly on the calling thread.
                /// </summary>
                /// <param name="propertyName">Name of the property whose value changed.</param>
                private void RefreshProperty(string propertyName)
                {
                    SendOrPostCallback raise = d =>
                        {
                            // Read the event once so the null check and the invocation see the
                            // same delegate.
                            var handler = PropertyChanged;
                            if (handler != null)
                            {
                                handler(this, new PropertyChangedEventArgs(propertyName));
                            }
                        };

                    var context = _synchronizationContext;
                    if (context == null)
                    {
                        // The static context is only assigned when a Form1 is constructed;
                        // without one there is nothing to marshal to.
                        raise(null);
                        return;
                    }

                    context.Post(raise, null);
                }
    
                /// <summary>
                /// Converts HTML markup to approximate plain text.
                /// Source line breaks and runs of spaces are collapsed, the head, script and
                /// style elements are removed together with their content, td/br/li/div/tr/p
                /// tags are turned into tabs or carriage returns, every remaining tag is
                /// dropped, a handful of character entities are decoded, and any other short
                /// entity is removed.
                /// </summary>
                /// <param name="source">HTML markup; may be null or empty.</param>
                /// <returns>
                /// The extracted text, using '\r' for line breaks and '\t' for table cells;
                /// an empty string when <paramref name="source"/> is null or empty.
                /// </returns>
                private static string StripHtml(string source)
                {
                    if (string.IsNullOrEmpty(source))
                    {
                        return string.Empty;
                    }

                    // Source formatting is not significant in HTML: line breaks become spaces,
                    // tabs disappear, and repeated spaces collapse to one.
                    string result = source.Replace("\r", " ")
                                          .Replace("\n", " ")
                                          .Replace("\t", string.Empty);
                    result = ReplaceIgnoringCase(result, @"( )+", " ");

                    // Elements whose content is never visible text.
                    result = RemoveElementWithContent(result, "head");
                    result = RemoveElementWithContent(result, "script");
                    result = RemoveElementWithContent(result, "style");

                    // Table cells become tabs.
                    result = ReplaceIgnoringCase(result, @"<( )*td([^>])*>", "\t");

                    // <br> (including the self-closing forms <br/> and <br />) and <li>
                    // become single line breaks.
                    result = ReplaceIgnoringCase(result, @"<( )*br( )*/?( )*>", "\r");
                    result = ReplaceIgnoringCase(result, @"<( )*li( )*>", "\r");

                    // <div>, <tr> and <p> start a new paragraph (double line break).
                    result = ReplaceIgnoringCase(result, @"<( )*div([^>])*>", "\r\r");
                    result = ReplaceIgnoringCase(result, @"<( )*tr([^>])*>", "\r\r");
                    result = ReplaceIgnoringCase(result, @"<( )*p([^>])*>", "\r\r");

                    // Drop every remaining tag (links, images, comments, closing tags, ...).
                    result = ReplaceIgnoringCase(result, @"<[^>]*>", string.Empty);

                    // Decode the common character entities. This must run after tag removal
                    // so that a decoded '<' or '>' is not mistaken for markup.
                    result = ReplaceIgnoringCase(result, @"&nbsp;", " ");
                    result = ReplaceIgnoringCase(result, @"&bull;", " * ");
                    result = ReplaceIgnoringCase(result, @"&lsaquo;", "<");
                    result = ReplaceIgnoringCase(result, @"&rsaquo;", ">");
                    result = ReplaceIgnoringCase(result, @"&trade;", "(tm)");
                    result = ReplaceIgnoringCase(result, @"&frasl;", "/");
                    result = ReplaceIgnoringCase(result, @"&lt;", "<");
                    result = ReplaceIgnoringCase(result, @"&gt;", ">");
                    result = ReplaceIgnoringCase(result, @"&copy;", "(c)");
                    result = ReplaceIgnoringCase(result, @"&reg;", "(r)");
                    // Any other short entity is simply removed.
                    result = ReplaceIgnoringCase(result, @"&(.{2,6});", string.Empty);

                    // Make line breaking consistent.
                    result = result.Replace("\n", "\r");

                    // Remove spaces caught between breaks/tabs, and tabs caught between breaks.
                    result = ReplaceIgnoringCase(result, "(\r)( )+(\r)", "\r\r");
                    result = ReplaceIgnoringCase(result, "(\t)( )+(\t)", "\t\t");
                    result = ReplaceIgnoringCase(result, "(\t)( )+(\r)", "\t\r");
                    result = ReplaceIgnoringCase(result, "(\r)( )+(\t)", "\r\t");
                    result = ReplaceIgnoringCase(result, "(\r)(\t)+(\r)", "\r\r");
                    // Several tabs directly after a line break collapse to one tab.
                    result = ReplaceIgnoringCase(result, "(\r)(\t)+", "\r\t");

                    // Cap runs at two line breaks and at four tabs, in a single pass each.
                    result = ReplaceIgnoringCase(result, "\r{3,}", "\r\r");
                    result = ReplaceIgnoringCase(result, "\t{5,}", "\t\t\t\t");

                    return result;
                }

                /// <summary>
                /// Removes every <paramref name="tagName"/> element including its content.
                /// Opening and closing tags are first normalised (attributes and inner spaces
                /// dropped) so the removal pattern can match them literally.
                /// </summary>
                private static string RemoveElementWithContent(string html, string tagName)
                {
                    string open = "<" + tagName + ">";
                    string close = "</" + tagName + ">";

                    html = ReplaceIgnoringCase(html, @"<( )*" + tagName + @"([^>])*>", open);
                    html = ReplaceIgnoringCase(html, @"(<( )*(/)( )*" + tagName + @"( )*>)", close);

                    // Lazy quantifier: each element is removed on its own. A greedy ".*" would
                    // also delete all visible text between the first and the last element.
                    return ReplaceIgnoringCase(html, "(" + open + ").*?(" + close + ")", string.Empty);
                }

                /// <summary>Case-insensitive regular-expression replace.</summary>
                private static string ReplaceIgnoringCase(string input, string pattern, string replacement)
                {
                    return System.Text.RegularExpressions.Regex.Replace(
                        input,
                        pattern,
                        replacement,
                        System.Text.RegularExpressions.RegexOptions.IgnoreCase);
                }
    
                /// <summary>
                /// Returns the number of characters (UTF-16 code units) in
                /// <paramref name="text"/>; a character outside the Basic Multilingual Plane
                /// therefore counts as two.
                /// </summary>
                /// <param name="text">Text to measure; must not be null.</param>
                /// <returns>The length of <paramref name="text"/>.</returns>
                /// <exception cref="ArgumentNullException"><paramref name="text"/> is null.</exception>
                private static int Count(string text)
                {
                    if (text == null)
                    {
                        throw new ArgumentNullException("text");
                    }

                    // Encoding to UTF-16 bytes and counting the characters back yields the same
                    // number as the string length, without allocating a byte buffer.
                    return text.Length;
                }
    
                public event PropertyChangedEventHandler PropertyChanged;
            }
        }
    }
    



    截图


  • 相关阅读:
    python模块—socket
    mac os系统的快捷键
    教你如何将UIImageView视图中的图片变成圆角
    关于ASP.NET MVC
    iOS 日期格式的转换
    将App通过XCode上传到AppStore 出现这个错误“An error occurred uploading to the iTunes Store”的解决方法
    关于MAC OS下面两个软件的功能改进——Dictionary和Fit 输入法
    分享一下上个星期的香港行程
    【博客园IT新闻】博客园IT新闻 iPhone 客户端发布
    解决Entity Framework Code First 的问题——Model compatibility cannot be checked because the database does not contain model metadata
  • 原文地址:https://www.cnblogs.com/beta2013/p/3377272.html
Copyright © 2011-2022 走看看