zoukankan      html  css  js  c++  java
  • 新闻采集源码可自写规则

    using System;
    using System.Data;
    using System.Configuration;
    using System.Collections;
    using System.Web;
    using System.Web.Security;
    using System.Web.UI;
    using System.Web.UI.WebControls;
    using System.Web.UI.WebControls.WebParts;
    using System.Web.UI.HtmlControls;
    using System.Text.RegularExpressions;
    using System.Collections;
    using System.IO;
    using System.Net;
    using System.Text;
    namespace NewsCollection
    {
        public partial class SiteEdit : System.Web.UI.Page
        {
            string urlData = "";
            protected void Page_Load(object sender, EventArgs e)
            {
                this.BtCollection.Attributes.Add("onclick", "Status.showInfo('加载中');");
                if (!Page.IsPostBack)
                {
                    AjaxAction();
                    BindData();
                }
            
            }
            public string GetRequest(string key)
            {
                key = Convert.ToString(Request[key]??"");
                key = key==null?(""):(key);
                return key;
            }
            public void AjaxAction()
            {
                string isAjax = GetRequest("isAjax").ToLower();
                if (isAjax == "true")
                {
                    string state = "";
                    string action = this.GetRequest("action").ToLower();
                    string values = this.GetRequest("values");
                    if (action == "newsbody")
                    {
                        string modelstart = this.GetRequest("modelstart");
                        string modelend = this.GetRequest("modelend");
                        string modelbody = this.GetRequest("modelbody");
                        string siteUrl = this.GetRequest("siteUrl");
                        ArrayList al = this.GetModelData(modelstart, modelend, modelbody, siteUrl);
                        StringBuilder sb = new StringBuilder();
                        foreach (string s in al)
                        {
                            sb.Append(s);
                        }
                        state = sb.ToString();
                    }
                    else if (action == "newsdetail")
                    {
                        string modelstart = this.GetRequest("modelstart");
                        string modelend = this.GetRequest("modelend");
                        string modelbody = this.GetRequest("modelbody");
                        string siteUrl = this.GetRequest("siteUrl");
                        string newsTitleStart = this.GetRequest("newsTitleStart");
                        string newsTitleEnd = this.GetRequest("newsTitleEnd");
                        string newsContentStart = this.GetRequest("newsContentStart");
                        string newsContentEnd = this.GetRequest("newsContentEnd");
                        state = GetNews(GetModelData(modelstart, modelend, modelbody, siteUrl), newsTitleStart, newsTitleEnd, newsContentStart, newsContentEnd);
                    }
                    else
                    {
                        state = "test Ajax";
                    }
                    Response.Clear();
                    Response.Write(state);
                    Response.End();
                }
            }
            public void BindData()
            {
                string Gid = Convert.ToString(Request["Gid"] ?? "");
                if (Gid.Length > 0)
                {
                    Beans.Sites sites = new Beans.Sites();
                    sites.Gid = Gid;
                    sites = sites.SelectById();
                    TbSiteName.Text  = sites.SiteName;
                    TbSiteUrl.Text =sites.SiteUrl;
                    TbSiteModelStart.Text = sites.SiteModelStart;
                    TbSiteModelEnd.Text =sites.SiteModelEnd;
                    TbSiteModelBody.Text = sites.SiteModelBody;
                    TbNewsTitleStart.Text =sites.NewsTitleStart;
                    TbNewsTitleEnd.Text =sites.NewsTitleEnd;
                    TbNewsContentStart.Text =sites.NewsContentStart;
                    TbNewsContentEnd.Text =sites.NewsContentEnd;
                }
            }
            protected void BtEdit_Click(object sender, EventArgs e)
            {
                string message = "系统错误请重试";
                string script = "history.go(-1)";
                string Gid = Convert.ToString(Request["Gid"]??"");
                Beans.Sites sites = new Beans.Sites();
                sites.SiteName = TbSiteName.Text.Trim();
                sites.SiteUrl = TbSiteUrl.Text.Trim();
                sites.SiteModelStart = TbSiteModelStart.Text.Trim();
                sites.SiteModelEnd = TbSiteModelEnd.Text.Trim();
                sites.SiteModelBody = TbSiteModelBody.Text.Trim();
                sites.NewsTitleStart = TbNewsTitleStart.Text.Trim();
                sites.NewsTitleEnd = TbNewsTitleEnd.Text.Trim();
                sites.NewsContentStart = TbNewsContentStart.Text.Trim();
                sites.NewsContentEnd = TbNewsContentEnd.Text.Trim();
                if (Gid.Length > 0)
                {
                    sites.Gid=Gid;
                    if (sites.Update())
                    {
                        message = "修改成功!";
                    }
                 
                }
                else
                {
                    if (sites.Add())
                    {
                        message = "添加成功!";
                    }
                  
                }
                Response.Write("<script type='text/javascript'>alert('"+message+"');"+script+"</script>");
             
            }

            protected void BtCollection_Click(object sender, EventArgs e)
            {
                GetNews(GetModelData(TbSiteModelStart.Text.Replace("(", "\\(").Replace(")", "\\)").Replace(".", "\\."), TbSiteModelEnd.Text.Replace("(", "\\(").Replace(")", "\\)").Replace(".", "\\."), TbSiteModelBody.Text, TbSiteUrl.Text), TbNewsTitleStart.Text.Trim(), TbNewsTitleEnd.Text.Trim(), TbNewsContentStart.Text.Trim(), TbNewsContentEnd.Text.Trim());
         
               // GetNews(GetModelData(TbSiteModelStart.Text.Replace("(", "\\(").Replace(")", "\\)").Replace(".", "\\."), TbSiteModelEnd.Text.Replace("(", "\\(").Replace(")", "\\)").Replace(".", "\\."), TbSiteModelBody.Text, TbSiteUrl.Text));
            }
            public ArrayList GetModelData(string modelstart,string modelend,string modelbody,string SiteUrl)
            {

                ArrayList al = new ArrayList();
                string content = GetHttpData(SiteUrl,"gb2312");
                Regex reg = new Regex(modelstart+"(?<newsBody>[\\s\\S]*)"+modelend, RegexOptions.IgnoreCase | RegexOptions.Multiline);
                Match mat = reg.Match(content);
                //TbContent.Text = mat.Groups["newsBody"].Value.ToString();
                Regex regurl = new Regex(modelbody.Replace("_url_", "(?<url>[^\"'\\s]+)"), RegexOptions.IgnoreCase | RegexOptions.Singleline);
                Match maturl = regurl.Match(mat.Groups["newsBody"].Value.ToString());
                while (maturl.Success)
                {
                    //z-zA-A0-9/\\.:
                    string temp = maturl.Groups["url"].Value;
                    al.Add(temp);
                    Response.Write(temp.StartsWith("http://") ? (temp) : (temp.Insert(0, "http://" + SiteUrl.Replace("http://","").Substring(0, SiteUrl.LastIndexOf("/")))) + "<br>");
                    maturl = maturl.NextMatch();
                }
                return  al;
            }
        
            public string  GetNews(ArrayList al,string titleStart,string titleEnd,string contentStart,string contetnEnd)
            {
                StringBuilder sb = new StringBuilder();
                if (al != null)
                {
                
                    foreach (string s in al)
                    {

                        Regex reg = new Regex(titleStart.Replace("(", "\\(").Replace(")", "\\)") + "(?<title>[^<]*)" + titleEnd.Replace("(", "\\(").Replace(")", "\\)") + "[\\s\\S]+" + contentStart.Replace("(", "\\(").Replace(")", "\\)") + "(?<content>[\\s\\S]+)" + contetnEnd.Replace("(", "\\(").Replace(")", "\\)"), RegexOptions.IgnoreCase | RegexOptions.Multiline);
                        Match mat = reg.Match(GetHttpData(s, "gb2312"));
                        //Response.Write(string.Format("news:{0}<br>content:{1}<br>",mat.Groups["title"].Value,mat.Groups["content"].Value));
                        Beans.News news = new Beans.News();
                        news.Title = mat.Groups["title"].Value;
                        news.Typeid = Convert.ToString(Request["Gid"]);//mat.Groups["title"].Value;
                        news.From = this.TbSiteName.Text;//mat.Groups["content"].Value;
                        news.Content = mat.Groups["content"].Value;
                        news.Add();
                        sb.AppendFormat("title:{0}", mat.Groups["title"].Value);
                    }
                }
                return sb.ToString();
            }
            public string GetHttpData(string sUrl, string encoding)
            {
                string sRslt = null;
                WebResponse oWebRps = null;
                WebRequest oWebRqst = WebRequest.Create(sUrl);
                oWebRqst.Timeout = 50000;
                try
                {
                    oWebRps = oWebRqst.GetResponse();
                }

                finally
                {
                    if (oWebRps != null)
                    {
                        StreamReader oStreamRd = new StreamReader(oWebRps.GetResponseStream(), System.Text.Encoding.GetEncoding(encoding));
                        sRslt = oStreamRd.ReadToEnd();
                        oStreamRd.Close();
                        oWebRps.Close();
                    }
                }
                return sRslt;
            }

        

       
        }
    }

  • 相关阅读:
    angular 项目 error TS2451: Cannot redeclare block-scoped variable 'ngDevMode'
    chrome 总崩溃的正确解决方法
    angular 学习日志
    mongodb 3.4 学习 (二)命令
    mongodb 3.4 学习 (一) 安装
    python中文入库
    [转贴] 流量统计脚本
    监控系统开发的一些参考
    nagios centos7 rpm打包
    collectd配置
  • 原文地址:https://www.cnblogs.com/bestsaler/p/1835568.html
Copyright © 2011-2022 走看看