zoukankan      html  css  js  c++  java
  • 采集新闻

    面向对象的方式做采集程序
    采集新闻
    方便扩展
    存储到xml


    步骤:
    1、找对象 文章视为对象 每一个网站视为对象
    2、为了方便扩展做类似于计算器的操作
    把采集的网站视为对象,所有的网站都能够采集 和保存成xml
    所以抽象出父类WebSite 抽象类
    实现具体的子类cnbeta sina等
    3、WebSite 抽象类 { Name(网站名字,只读)、Path(xml保存路径)、Url(采集的url);抽象方法:Load(采集新闻)、Save(把新闻保存到xml中)}
    4、cnbeta、donews 等具体网站类继承 WebSite { }
    5、窗体加载时候根据反射读取每个继承自WebSite的子类的名字,添加到下拉框中
    6、点采集按钮的时候,根据下拉框中的内容创建具体的子类,执行采集方法
    7、点保存按钮的时候 把采集到的新闻集合,存储在xml中

    cnBate
    using System;
    using System.Collections.Generic;
    using System.Linq;
    using System.Text;
    using System.Net;
    using System.IO;
    using System.Text.RegularExpressions;
    using System.Xml;
    
    namespace 采集程序3
    {
        /// <summary>
        /// Collector for the cnBeta front page. <see cref="Load"/> downloads the page at
        /// <c>Url</c> and extracts articles with a regular expression; <see cref="Save"/>
        /// writes the collected articles to the XML file at <c>Path</c>.
        /// </summary>
        class cnBate : WebSite
        {
            /// <summary>Display name of this site.</summary>
            public override string name
            {
                get { return "cnBate"; }
            }

            // Articles collected by the most recent Load(); this is what Save() persists.
            List<Article> articles = new List<Article>();

            // Matches one news entry and captures the named groups title, author, time and content.
            string regex = @"<div\s+class=""newslist"">\s+<dl>.+?<strong>(?<title>.+?)</strong></a>.+?<span>(?<author>.+?)发布于\s+(?<time>\d{4}\-\d{2}\-\d{2}\s+?\d{2}:\d{2}:\d{2}).+?</a>.+?<span>(?<content>.+?)</span></dd>";

            /// <summary>
            /// Downloads the page at <c>Url</c>, decodes it as gb2312 and parses every
            /// news entry matched by the pattern into an <see cref="Article"/>.
            /// </summary>
            /// <returns>
            /// The articles found by this call. The same list is kept internally for a
            /// later <see cref="Save"/>.
            /// </returns>
            public override List<Article> Load()
            {
                // Use a fresh list per call: repeated loads must not pile up duplicates,
                // and lists handed out by earlier calls stay untouched.
                articles = new List<Article>();

                string html;
                // WebClient, the response stream and the reader are all disposable.
                using (WebClient client = new WebClient())
                using (Stream stream = client.OpenRead(base.Url))
                using (StreamReader reader = new StreamReader(stream, Encoding.GetEncoding("gb2312")))
                {
                    // ReadToEnd consumes the whole response in one call; no loop is needed.
                    html = reader.ReadToEnd();
                }

                if (string.IsNullOrEmpty(html))
                {
                    return articles;
                }

                // Singleline lets '.' span line breaks, which the entry markup contains.
                foreach (Match match in Regex.Matches(html, regex, RegexOptions.Singleline))
                {
                    Article article = new Article();
                    article.Title = match.Groups["title"].Value;
                    article.Author = match.Groups["author"].Value;
                    // Strip any inline markup left inside the captured summary.
                    article.Content = Regex.Replace(match.Groups["content"].Value, "<.+?>", "");
                    article.Time = DateTime.Parse(match.Groups["time"].Value);
                    articles.Add(article);
                }
                return articles;
            }

            /// <summary>
            /// Persists the collected articles: creates the XML file when it does not
            /// exist yet, otherwise appends to the existing document.
            /// </summary>
            public override void Save()
            {
                if (File.Exists(base.Path))
                {
                    AddXml();
                }
                else
                {
                    CreateXml();
                }
            }

            /// <summary>
            /// Creates a new XML document with a <c>News</c> root, writes one <c>New</c>
            /// element per collected article and saves it to <c>Path</c>.
            /// </summary>
            public void CreateXml()
            {
                XmlDocument doc = new XmlDocument();
                doc.AppendChild(doc.CreateXmlDeclaration("1.0", "utf-8", null));
                XmlElement root = doc.CreateElement("News");
                doc.AppendChild(root);

                AppendArticles(doc, root);
                doc.Save(base.Path);
            }

            /// <summary>
            /// Loads the existing XML document at <c>Path</c>, appends one <c>New</c>
            /// element per collected article to its root and saves it back.
            /// </summary>
            public void AddXml()
            {
                XmlDocument doc = new XmlDocument();
                doc.Load(base.Path);

                AppendArticles(doc, doc.DocumentElement);
                doc.Save(base.Path);
            }

            // Writes every collected article as a <New> child of the given parent element.
            private void AppendArticles(XmlDocument doc, XmlElement parent)
            {
                foreach (Article item in articles)
                {
                    XmlElement entry = doc.CreateElement("New");
                    parent.AppendChild(entry);
                    CreateItems(doc, entry, item.Title, "Title");
                    CreateItems(doc, entry, item.Author, "Author");
                    CreateItems(doc, entry, item.Content, "Content");
                    // NOTE(review): ToString() uses the current culture's date format.
                    CreateItems(doc, entry, item.Time.ToString(), "Time");
                }
            }

            /// <summary>
            /// Appends a child element named <paramref name="str"/> whose text is
            /// <paramref name="item"/> to <paramref name="child"/>.
            /// </summary>
            /// <param name="doc">Document used to create the element.</param>
            /// <param name="child">Element that receives the new node.</param>
            /// <param name="item">Text content of the new node.</param>
            /// <param name="str">Tag name of the new node.</param>
            private static void CreateItems(XmlDocument doc, XmlElement child, string item, string str)
            {
                XmlElement node = doc.CreateElement(str);
                node.InnerText = item;
                child.AppendChild(node);
            }
        }
    }
    WebSite
    using System;
    using System.Collections.Generic;
    using System.Linq;
    using System.Text;
    
    namespace 采集程序3
    {
        /// <summary>
        /// Base type for every site that can be collected. A concrete site supplies its
        /// display name and implements loading articles and saving them.
        /// </summary>
        public abstract class WebSite
        {
            /// <summary>Read-only display name of the site.</summary>
            public abstract string name { get; }

            /// <summary>Address of the page to collect from.</summary>
            public string Url { get; set; }

            /// <summary>Location of the file the collected articles are saved to.</summary>
            public string Path { get; set; }

            /// <summary>Collects the articles of this site.</summary>
            /// <returns>The collected articles.</returns>
            public abstract List<Article> Load();

            /// <summary>Persists the collected articles.</summary>
            public abstract void Save();
        }
    }
    Article
    using System;
    using System.Collections.Generic;
    using System.Linq;
    using System.Text;
    
    namespace 采集程序3
    {
        /// <summary>
        /// A single collected news item: title, author, body text and publication time.
        /// </summary>
        public class Article
        {
            /// <summary>Headline of the article.</summary>
            public string Title { get; set; }

            /// <summary>Name of the person who published the article.</summary>
            public string Author { get; set; }

            /// <summary>Body text of the article.</summary>
            public string Content { get; set; }

            /// <summary>Publication time of the article.</summary>
            public DateTime Time { get; set; }
        }
    }
    Fectory
    using System;
    using System.Collections.Generic;
    using System.Linq;
    using System.Text;
    
    namespace 采集程序3
    {
        /// <summary>
        /// Simple factory that maps a site name to a configured <see cref="WebSite"/> instance.
        /// </summary>
        class Fectory
        {
            /// <summary>
            /// Creates the site whose name equals <paramref name="type"/> and assigns its
            /// save path and source address.
            /// </summary>
            /// <param name="type">Name of the site to create.</param>
            /// <returns>The configured site, or <c>null</c> when the name is not recognised.</returns>
            public static WebSite CreateObj(string type)
            {
                // Only one site is known; anything else yields null.
                if (type != "cnBate")
                {
                    return null;
                }

                return new cnBate
                {
                    Path = "cnBate.xml",
                    Url = @"http://www.cnbeta.com/"
                };
            }
        }
    }
    Form1
    using System;
    using System.Collections.Generic;
    using System.ComponentModel;
    using System.Data;
    using System.Drawing;
    using System.Linq;
    using System.Text;
    using System.Windows.Forms;
    using System.Reflection;
    
    namespace 采集程序3
    {
        /// <summary>
        /// Main window: lists the available sites, collects news from the selected
        /// site into the list view and saves the collected news on request.
        /// </summary>
        public partial class Form1 : Form
        {
            public Form1()
            {
                InitializeComponent();
            }

            /// <summary>
            /// Fills the combo box with the name of every concrete <see cref="WebSite"/>
            /// subclass found in the form's assembly.
            /// </summary>
            private void Form1_Load(object sender, EventArgs e)
            {
                // Reflection discovers the sites, so a new subclass shows up without changes here.
                Assembly ass = sender.GetType().Assembly;
                foreach (Type type in ass.GetTypes())
                {
                    if (typeof(WebSite).IsAssignableFrom(type) && !type.IsAbstract)
                    {
                        // The check above guarantees the type, so cast directly rather
                        // than use 'as' followed by an unchecked dereference.
                        WebSite site = (WebSite)Activator.CreateInstance(type);
                        comboBox1.Items.Add(site.name);
                    }
                }
            }

            // Site used by the most recent collect click; null until then or when the lookup failed.
            WebSite ws;

            /// <summary>
            /// Creates the site selected in the combo box, collects its articles and
            /// shows them in the list view.
            /// </summary>
            private void btnLoad_Click(object sender, EventArgs e)
            {
                ws = Fectory.CreateObj(comboBox1.Text);
                if (ws == null)
                {
                    MessageBox.Show("该选项不存在");
                    return;
                }

                List<Article> articles = ws.Load();

                // Drop rows from an earlier collection so the view matches what Save() will write.
                listView1.Items.Clear();
                foreach (Article item in articles)
                {
                    ListViewItem lvi = new ListViewItem(item.Title);
                    lvi.SubItems.Add(item.Author);
                    lvi.SubItems.Add(item.Content);
                    lvi.SubItems.Add(item.Time.ToString());
                    listView1.Items.Add(lvi);
                }
            }

            /// <summary>Saves the articles of the most recently collected site.</summary>
            private void btnSave_Click(object sender, EventArgs e)
            {
                // Nothing has been collected yet, or the last lookup failed.
                if (ws == null)
                {
                    MessageBox.Show("请先采集新闻");
                    return;
                }

                ws.Save();
                MessageBox.Show("保存成功");
            }

            /// <summary>Shows the title of the selected row.</summary>
            private void listView1_DoubleClick(object sender, EventArgs e)
            {
                // No selected row means there is nothing to show.
                if (listView1.SelectedItems.Count == 0)
                {
                    return;
                }

                MessageBox.Show(listView1.SelectedItems[0].SubItems[0].Text);
            }
        }
    }

  • 相关阅读:
    类似最长递增子序,记忆化DP—— Codeforces Beta Round #4 (Div. 2 Only)D Mysterious Present
    最小逆序数对——hdu1394
    区间更新 求总区间——hdu1754
    抽象类 虚函数实现
    poj2271
    poj2246
    poj2410
    poj2567
    poj2247
    Integration Services 学习(7):包部署 (转自游子吟)
  • 原文地址:https://www.cnblogs.com/hejinyang/p/2818416.html
Copyright © 2011-2022 走看看