zoukankan      html  css  js  c++  java
  • C# RSS:新闻抓取正文并转TXT

    如果你喜欢手机阅读

    如果宿舍手机几乎没CMNET信号

    如果你不想浪费手机流量

    如果你只想睡前静静浏览今天的新闻

    以下程序可抓取 cnblogs、cnbeta、网易深度、南方周末的首页文章正文并保存为 TXT(代码中默认只启用 cnblogs,其余站点的配置已注释,取消注释即可启用),也可仿照添加其它网站

     
    using System;
    using System.Collections.Generic;
    using System.Linq;
    using System.Text;
    using System.Text.RegularExpressions;
    using System.Net;
    using System.Collections;
    using System.Threading;
    using System.IO;
    using System.Configuration;
    namespace RSS
    {
        /// <summary>
        /// Console entry point: configures one <see cref="GetItem"/> per news site and starts it.
        /// Only the CnBlogs scraper is enabled; the other site configurations are kept below as
        /// commented-out samples that can be re-enabled by uncommenting them.
        /// </summary>
        class Program
        {
            /// <summary>Output directory used when none is given on the command line.</summary>
            private const string DefaultOutputDirectory = "i://";

            /// <summary>
            /// Builds the output file path "&lt;directory&gt;/&lt;hostName&gt;_&lt;yyMMdd_HH-mm&gt;.txt".
            /// </summary>
            /// <param name="directory">Directory the text file is written to.</param>
            /// <param name="hostName">Site label used as the file name prefix.</param>
            /// <returns>The combined path; Path.Combine only adds a separator when one is missing.</returns>
            private static string BuildOutputPath(string directory, string hostName)
            {
                // Invariant culture keeps the timestamp identical regardless of the machine's locale.
                string stamp = DateTime.Now.ToString("yyMMdd_HH-mm", System.Globalization.CultureInfo.InvariantCulture);
                return Path.Combine(directory, string.Format("{0}_{1}.txt", hostName, stamp));
            }

            /// <summary>
            /// Starts the configured scrapers. Each one runs on its own thread, so Main returns
            /// immediately and the process ends when the scraper threads finish.
            /// </summary>
            /// <param name="args">
            /// Optional. args[0] is the directory the text files are written to;
            /// when omitted the original default ("i://") is used.
            /// </param>
            static void Main(string[] args)
            {
                string file = DefaultOutputDirectory;
                if (args != null && args.Length > 0 && !string.IsNullOrEmpty(args[0]))
                {
                    file = args[0];
                }

                {
                    GetItem gi1 = new GetItem();
                    gi1.pageUrl = "http://news.cnblogs.com/n/page/";
                    gi1.prefix = "http://news.cnblogs.com";
                    gi1.pageUrlsRegex = "\"(?<url>/n/[\\d]+?)\"";
                    gi1.titleRegex = "<div id=\"news_title\"><a.*?>(?<title>.*?)</a>";
                    gi1.timeRegex = "<span class=\"time\">(?<time>.*?)</span>";
                    gi1.bodyRegex = "<div id=\"news_body\">(?<body>.*?)</div>";
                    gi1.hostName = "CnBlogs";
                    gi1.encoding = "utf-8";
                    gi1.fileSave = BuildOutputPath(file, gi1.hostName);
                    Console.WriteLine(gi1.fileSave);
                    gi1.pageWantToGet = 20;
                    gi1.threadStart();
                }

                // Sample configuration: cnBeta (home page only).
                //{
                //    GetItem gi2 = new GetItem();
                //    gi2.prefix = "http://www.cnbeta.com/";
                //    gi2.pageUrlsRegex = "\"(?<url>/articles/[\\d]+.htm?)\"";
                //    gi2.titleRegex = "id=\"news_title\">(?<title>.*?)</h3>";
                //    gi2.timeRegex = "id=\"news_author\"><span>(?<time>.*?)[|]";
                //    gi2.bodyRegex = "<div id=\"news_content\">(?<body>.*?)<!-- end newsBox news -->";
                //    gi2.hostName = "CnBeta";
                //    gi2.encoding = "gb2312";
                //    gi2.fileSave = BuildOutputPath(file, gi2.hostName);
                //    Console.WriteLine(gi2.fileSave);
                //    gi2.homeOnly = true;
                //    gi2.threadStart();
                //}

                // Sample configuration: NetEase focus (absolute links, multi-page articles).
                //{
                //    GetItem gi2 = new GetItem();
                //    gi2.pageUrlsRegex = "\"(?<url>http://focus.news.163.com.[^>< ]*.html?)\"";
                //    gi2.prefix = "http://focus.news.163.com/";
                //    gi2.hasPrefix = false;//default:true
                //    gi2.hasManyPage = true;//default:false
                //    gi2.manyPageRegex = "<span class=\"s1 s3\">上一页</span>(?<np>.*?)下一页</a>";
                //    gi2.titleRegex = "id=\"h1title\">(?<title>.*?)</h1>";
                //    gi2.timeRegex = "<span class=\"info\">(?<time>.*?)来源";
                //    gi2.bodyRegex = "class=\"summary\"(?<body>.*?)<!-- 分页 -->";
                //    gi2.hostName = "163";
                //    gi2.encoding = "GBK";
                //    gi2.fileSave = BuildOutputPath(file, gi2.hostName);
                //    Console.WriteLine(gi2.fileSave);
                //    gi2.homeOnly = true;
                //    gi2.threadStart();
                //}

                // Sample configuration: Southern Weekly / infzm (absolute links, single-page articles).
                //{
                //    GetItem gi2 = new GetItem();
                //    gi2.pageUrlsRegex = "\"(?<url>http://www.infzm.com/content/[\\d]+?)\"";
                //    gi2.prefix = "http://www.infzm.com/";
                //    gi2.hasPrefix = false;//default:true
                //    gi2.hasManyPage = false;//default:false
                //    //gi2.manyPageRegex = "<span class=\"s1 s3\">上一页</span>(?<np>.*?)下一页</a>";
                //    gi2.titleRegex = "<div id=\"detailContent\">[\\s]*<h1>[\\s]*(?<title>.*?)[\\s]*</h1>";
                //    gi2.timeRegex = "<span class=\"pubTime\">(?<time>.*?)</span>";
                //    gi2.bodyRegex = "<div id=\"content-context\">(?<body>.*?)<!--end #text-->";
                //    gi2.hostName = "infzm";
                //    gi2.encoding = "utf-8";
                //    gi2.fileSave = BuildOutputPath(file, gi2.hostName);
                //    Console.WriteLine(gi2.fileSave);
                //    gi2.homeOnly = true;
                //    gi2.threadStart();
                //}

                // Uncomment to keep the console window open until a key is pressed.
                //Console.Read();
            }
        }
        /// <summary>
        /// Downloads the articles of one news site and appends them, as plain text, to a single file.
        /// Set the public fields, then call <see cref="threadStart"/>; the work runs on its own thread.
        /// Index pages are scanned with <see cref="pageUrlsRegex"/> for article links, and every article
        /// is reduced to title / time / body through the corresponding regular expressions.
        /// </summary>
        class GetItem{
            /// <summary>Base URL of the paged index; the 1-based page number is appended to it.</summary>
            public string pageUrl;
            /// <summary>When true, only the page at <see cref="prefix"/> is scanned for article links.</summary>
            public bool homeOnly = false;
            /// <summary>When true, <see cref="prefix"/> is prepended to every captured link.</summary>
            public bool hasPrefix = true;
            /// <summary>Number of index pages scanned when <see cref="homeOnly"/> is false.</summary>
            public int pageWantToGet = 1;
            /// <summary>When true, continuation pages of an article are looked up via <see cref="manyPageRegex"/>.</summary>
            public bool hasManyPage = false;
            /// <summary>Regex whose "np" group captures the pager markup of a multi-page article.</summary>
            public string manyPageRegex;
            /// <summary>Site root: home page URL for <see cref="homeOnly"/> and base for relative links.</summary>
            public string prefix;
            /// <summary>Article links in discovery order (as captured, before the prefix is applied).</summary>
            private List<string> pageUrls;
            /// <summary>Every link already queued or fetched; used only for de-duplication.</summary>
            private HashSet<string> seenUrls;
            /// <summary>Regex whose "url" group captures an article link.</summary>
            public string pageUrlsRegex;
            /// <summary>Regex whose "title" group captures the article title.</summary>
            public string titleRegex;
            /// <summary>Regex whose "time" group captures the publication time.</summary>
            public string timeRegex;
            /// <summary>Regex whose "body" group captures the article body markup.</summary>
            public string bodyRegex;
            /// <summary>Path of the text file the articles are appended to.</summary>
            public string fileSave;
            /// <summary>Site label used in console messages.</summary>
            public string hostName;
            /// <summary>Name of the encoding used to decode downloaded pages (passed to Encoding.GetEncoding).</summary>
            public string encoding;

            private const string RequestContentType = "text/html;charset=utf-8";

            /// <summary>
            /// Tag clean-up rules, applied in order. Built once so the regexes are not re-parsed
            /// for every article.
            /// </summary>
            private static readonly KeyValuePair<Regex, string>[] tagRules = buildTagRules();

            /// <summary>HTML entities replaced by plain characters (or dropped) after tag removal.</summary>
            private static readonly string[][] entityRules = {
                new[] { "&rdquo;", "\"" },
                new[] { "&ldquo;", "\"" },
                new[] { "&lsquo;", "'" },
                new[] { "&rsquo;", "'" },
                new[] { "&nbsp;", " " },
                new[] { "&hellip;", "" },
                new[] { "&ndash;", "-" },
                new[] { "&mdash;", "" },
            };

            public GetItem()
            {
                pageUrls = new List<string>(50);
                seenUrls = new HashSet<string>();
            }

            /// <summary>
            /// Normalizes <see cref="prefix"/> to end with "/" and starts the scrape on a new thread.
            /// </summary>
            public void threadStart() {

                // prefix is optional when absolute links are used, so tolerate it being unset.
                if (prefix != null && !prefix.EndsWith("/", StringComparison.Ordinal))
                {
                    prefix += "/";
                }
                Thread th = new Thread(new ThreadStart(start));
                th.Start();
            }

            /// <summary>Thread body: collect the article links, then download every article.</summary>
            private void start() {

                if (homeOnly)
                {
                    getPageUrls(-1);
                }
                else
                {
                    for (int i = 1; i <= pageWantToGet; i++)
                    {
                        getPageUrls(i);
                    }
                }
                startGetAll();
            }

            /// <summary>Appends <paramref name="str"/> plus a line break to <see cref="fileSave"/>, encoded as gb2312.</summary>
            private void WriteFile(string str) {
                // using blocks close the file even when the write throws.
                using (FileStream fs = new FileStream(fileSave, FileMode.Append))
                using (StreamWriter streamWriter = new StreamWriter(fs, Encoding.GetEncoding("gb2312")))
                {
                    streamWriter.WriteLine(str);
                }
            }

            /// <summary>Creates the ordered list of tag clean-up rules used by <see cref="deleteTag"/>.</summary>
            private static KeyValuePair<Regex, string>[] buildTagRules()
            {
                List<KeyValuePair<Regex, string>> rules = new List<KeyValuePair<Regex, string>>();

                // Drop script/style elements together with their content first; stripping only the
                // tags would leave JavaScript/CSS source in the article text.
                rules.Add(new KeyValuePair<Regex, string>(
                    new Regex("<[\\s]*(script|style)[^>]*>.*?</[\\s]*\\1[^>]*>", RegexOptions.Singleline), ""));

                // Paragraph and line-break tags: some become a line break, the rest are removed.
                string[][] lineBreakRules = {
                    new[] { "<[\\s]*p[^>]*>?>", "" },
                    new[] { "</[\\s]*p[\\s]*?>", "\r\n" },
                    new[] { "<[\\s]*br[\\s]*/[\\s]*[^>]*>?>", "\r\n" },
                    new[] { "<[\\s]*br[^>]*>?>", "" },
                    new[] { "</[\\s]*br[^>]*>?>", "\r\n" },
                };
                foreach (string[] rule in lineBreakRules)
                {
                    rules.Add(new KeyValuePair<Regex, string>(new Regex(rule[0]), rule[1]));
                }

                // Tags that are simply removed. A leading "/" marks the closing form.
                // The patterns match on the tag-name prefix, so e.g. "b" also removes <body ...>.
                string[] removedTags = {
                    "a", "/a", "img", "/img", "strong", "/strong", "div", "/div",
                    "b", "/b", "span", "/span", "script", "/script", "li", "/li",
                    "style", "/style", "i", "/i", "h3", "h2", "/h3", "/h2",
                    "font", "/font", "q", "/q",
                };
                foreach (string tag in removedTags)
                {
                    bool closing = tag.StartsWith("/", StringComparison.Ordinal);
                    string name = closing ? tag.Substring(1) : tag;
                    string pattern = (closing ? "</[\\s]*" : "<[\\s]*") + name + "[\\s]*[^>]*>?>";
                    rules.Add(new KeyValuePair<Regex, string>(new Regex(pattern), ""));
                }

                return rules.ToArray();
            }

            /// <summary>
            /// Converts an HTML fragment to plain text in place: removes script/style blocks,
            /// turns paragraph ends and self-closing line breaks into CRLF, strips the remaining
            /// known tags and replaces a handful of HTML entities.
            /// </summary>
            /// <param name="str">The fragment to clean; replaced by the cleaned text.</param>
            private void deleteTag(ref string str)
            {
                if (string.IsNullOrEmpty(str))
                {
                    return;
                }
                foreach (KeyValuePair<Regex, string> rule in tagRules)
                {
                    str = rule.Key.Replace(str, rule.Value);
                }
                foreach (string[] entity in entityRules)
                {
                    str = str.Replace(entity[0], entity[1]);
                }
            }

            /// <summary>
            /// Downloads <paramref name="url"/> and returns its text decoded with <see cref="encoding"/>.
            /// Lines are concatenated without separators, so the returned text contains no line breaks.
            /// </summary>
            /// <exception cref="Exception">Any network, URI or encoding failure is passed on to the caller.</exception>
            private string downloadPage(string url)
            {
                HttpWebRequest req = (HttpWebRequest)WebRequest.Create(url);
                req.Method = "GET";
                req.ContentType = RequestContentType;

                StringBuilder sb = new StringBuilder();
                using (HttpWebResponse wr = (HttpWebResponse)req.GetResponse())
                using (Stream respStream = wr.GetResponseStream())
                using (StreamReader reader = new StreamReader(respStream, Encoding.GetEncoding(this.encoding)))
                {
                    string line;
                    while ((line = reader.ReadLine()) != null)
                    {
                        sb.Append(line);
                    }
                }
                return sb.ToString();
            }

            /// <summary>
            /// Turns a captured link into the URL to request: with <see cref="hasPrefix"/> the link is
            /// appended to <see cref="prefix"/> (a leading "/" is dropped to avoid a double slash),
            /// otherwise it is used as captured.
            /// </summary>
            private string resolveUrl(string link)
            {
                if (!hasPrefix)
                {
                    return link;
                }
                if (link.StartsWith("/", StringComparison.Ordinal))
                {
                    return string.Format("{0}{1}", prefix, link.Substring(1));
                }
                return string.Format("{0}{1}", prefix, link);
            }

            /// <summary>
            /// Downloads a continuation page of a multi-page article and returns its cleaned body text.
            /// </summary>
            /// <returns>The body text, or an empty string when the download fails or no body is found.</returns>
            private string getNextPageContent(string url) {

                Console.WriteLine(url);

                try
                {
                    string html = downloadPage(url);
                    Match m = Regex.Match(html, this.bodyRegex, RegexOptions.Singleline);
                    if (!m.Success)
                    {
                        return "";
                    }

                    string body = m.Groups["body"].Value;
                    deleteTag(ref body);
                    Console.WriteLine("已获取下一页正文");
                    return body;
                }
                catch (Exception ex)
                {
                    // Best effort: a failed continuation page must not abort the article.
                    Console.WriteLine("异常:{0}",ex.Message);
                    return "";
                }
            }

            /// <summary>
            /// Appends the body of every not-yet-seen continuation page linked from the pager of
            /// <paramref name="html"/> to <paramref name="cont"/>.
            /// </summary>
            private void appendContinuationPages(string html, StringBuilder cont)
            {
                Match pager = Regex.Match(html, this.manyPageRegex, RegexOptions.Singleline);
                if (!pager.Success)
                {
                    return;
                }

                Console.WriteLine("存在多页..");
                string pagesurl = pager.Groups["np"].Value;
                foreach (Match link in Regex.Matches(pagesurl, this.pageUrlsRegex, RegexOptions.Singleline))
                {
                    string u = link.Groups["url"].Value;
                    // Recorded in seenUrls only: adding it to pageUrls would make startGetAll
                    // fetch the continuation page a second time as a stand-alone article.
                    if (seenUrls.Add(u))
                    {
                        cont.AppendLine(getNextPageContent(resolveUrl(u)));
                    }
                }
            }

            /// <summary>
            /// Downloads one article and appends title, "(index/total)url", time, body and, for
            /// multi-page articles, the continuation pages to the output file. Failures are reported
            /// on the console and the article is skipped.
            /// </summary>
            /// <param name="url">Absolute URL of the article.</param>
            /// <param name="index">Position of the article shown in the progress label.</param>
            /// <param name="total">Total number of articles shown in the progress label.</param>
            private void getContent(string url,int index,int total)
            {
                Console.WriteLine(url);

                try
                {
                    string html = downloadPage(url);
                    StringBuilder cont = new StringBuilder();

                    Match m = Regex.Match(html, this.titleRegex, RegexOptions.Singleline);
                    if (m.Success)
                    {
                        Console.WriteLine("title:{0}",m.Groups["title"].Value);
                        cont.AppendLine(m.Groups["title"].Value);
                    }

                    cont.AppendLine(string.Format("({0}/{1}){2}",index,total,url));

                    m = Regex.Match(html, this.timeRegex, RegexOptions.Singleline);
                    if (m.Success)
                    {
                        Console.WriteLine("time:{0}", m.Groups["time"].Value);
                        cont.AppendLine(m.Groups["time"].Value);
                    }

                    m = Regex.Match(html, this.bodyRegex, RegexOptions.Singleline);
                    if (m.Success)
                    {
                        string body = m.Groups["body"].Value;
                        deleteTag(ref body);
                        Console.WriteLine("获取正文");
                        cont.AppendLine(body);
                    }

                    // Without a pager regex there is nothing to look for; skip instead of throwing
                    // and losing the article that was already extracted.
                    if (hasManyPage && !string.IsNullOrEmpty(manyPageRegex))
                    {
                        appendContinuationPages(html, cont);
                    }

                    cont.AppendLine("--------------------------------------------------------------");
                    WriteFile(cont.ToString());
                }
                catch (Exception ex)
                {
                    Console.WriteLine("异常:{0},{1}",ex.Source,ex.Message);
                }
            }

            /// <summary>Downloads every collected article, labelling them 1..N.</summary>
            private void startGetAll() {

                int total = pageUrls.Count;
                for (int i = 0; i < total; i++)
                {
                    getContent(resolveUrl(pageUrls[i]), i + 1, total);
                }
            }

            /// <summary>
            /// Scans one index page for article links and queues the ones not seen before.
            /// </summary>
            /// <param name="pageIndex">
            /// 1-based page number appended to <see cref="pageUrl"/>, or -1 to scan <see cref="prefix"/> itself.
            /// </param>
            private void getPageUrls(int pageIndex)
            {
                string url = pageIndex == -1 ? prefix : string.Format("{0}{1}",this.pageUrl,pageIndex);
                Console.WriteLine(url);
                try
                {
                    string html = downloadPage(url);
                    foreach (Match link in Regex.Matches(html, this.pageUrlsRegex, RegexOptions.Singleline))
                    {
                        string temp = link.Groups["url"].Value;
                        if (seenUrls.Add(temp))
                        {
                            pageUrls.Add(temp);
                        }
                    }
                    Console.WriteLine("{0}:{1} articles.",this.hostName,pageUrls.Count);
                }
                catch (Exception ex)
                {
                    // A failed index page is reported and skipped; the remaining pages are still scanned.
                    Console.WriteLine(ex.Message);
                }
                Console.WriteLine("{0} end!", this.hostName);
            }
        }
    }



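    注:还需要为项目添加配置文件(Project -> Add New Item -> Application Configuration File,即 app.config),
    用于开启 useUnsafeHeaderParsing,使 HttpWebRequest 能够容忍部分站点返回的不规范响应头。内容如下: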

    <?xml version="1.0" encoding="utf-8" ?>
    <configuration>
      <system.net>
        <settings>
          <httpWebRequest  useUnsafeHeaderParsing= "true"  />
        </settings>
      </system.net>
    </configuration>


  • 相关阅读:
    5G网络逐渐普及TSINGSEE青犀视频云边端架构网页视频实时互动直播系统又将如何发展?
    【开发记录】TSINGSEE青犀视频云边端架构Visual Studio 2017自建WebRTC中peerconnection_client编译报无法解析错误
    安防视频云服务平台EasyCVR视频智能分析系统运行控制台报404错误如何排查?
    一对一或一对多音视频通话会议系统可以通过哪些方式实现?
    TSINGSEE青犀视频云边端视频智能分析平台开发VMware下安装Ubuntu系统后无法安装VMwaretools问题解决
    最简单的Windows套接字(Socket)例子(源码,实例)
    KJAVA虚拟机Hack笔记实现MIDP的SLAVE事件模型
    系统程序员成长计划你的数据放在哪里(下)
    使用new实现realloc操作
    KJava虚拟机hack笔记基于GTK的移植
  • 原文地址:https://www.cnblogs.com/yangyh/p/2046990.html
Copyright © 2011-2022 走看看