zoukankan      html  css  js  c++  java
  • MongoDB CRUD 操作:采集部分代码

    using System;
    using System.Collections.Generic;
    using System.ComponentModel.Design;
    using System.Linq;
    using System.Text;
    using System.Text.RegularExpressions;
    using System.Threading.Tasks;
    using CDPWIB.DAL;
    using CDPWIB.Data;
    using CommonUtility;
    using HtmlAgilityPack;
    using MongoDB.Driver;
    using MongoDB.Driver.Builders;
    using MongoDB.Driver.Linq;
    using Newtonsoft.Json;
    using Newtonsoft.Json.Linq;
    using WebKit;
    
    namespace CDPWIB.WebCollection
    {
        internal class QiDianCol : INovalCollect
        {
            // Numeric id of the site this collector scrapes, converted from NovalSource.QiDian.
            // Every record built in this class is stamped with it, and it is the filter used
            // when existing rows are removed or queried.
            private int Source = Convert.ToInt32(NovalSource.QiDian);
    
            // Collection handle for NovalTempBase documents, obtained with the "Noval" name argument.
            // NOTE(review): no method visible in this class reads this field; the methods request
            // their own handle instead — confirm whether it is still needed.
            private readonly MongoCollection<NovalTempBase> Novalcol =
                MongoConnectionFactory.GetMongoCollction<NovalTempBase>("Noval", typeof (NovalTempBase));
    
    
            /// <summary>
            /// Downloads the two category scripts, parses the top-level categories and their
            /// sub-categories, and replaces the rows stored for this source in the
            /// NovalTypeTemp and NovalSubType collections.
            /// </summary>
            /// <remarks>
            /// Both arrays are parsed completely before anything is removed from the database,
            /// so a changed or unreachable script no longer leaves the collections half-updated.
            /// </remarks>
            /// <exception cref="InvalidOperationException">
            /// Thrown when a category array cannot be located or parsed in the downloaded script.
            /// </exception>
            public void GetNovalTypeTemp()
            {
                var typecol = MongoConnectionFactory.GetMongoCollction<NovalTypeTemp>("Noval", typeof (NovalTypeTemp));
                var subcol = MongoConnectionFactory.GetMongoCollction<NovalSubType>("Noval", typeof (NovalSubType));

                // Top-level categories: http://www.qidian.com/Javascript/qidian.bookstore.js?t=20130917
                // All whitespace and a few noise characters are stripped so the array literal
                // can be cut out with a simple regex.
                string typeshtml =
                    NetHelper.HttpGet("http://www.qidian.com/Javascript/qidian.bookstore.js?t=20130917")
                        .Replace("/", "")
                        .Replace("&nbsp", "")
                        .Replace("\r", "")
                        .Replace("\n", "")
                        .Replace("\t", "")
                        .Replace("|", "")
                        .Replace(" ", "");

                string subtypes =
                    NetHelper.HttpGet("http://script.cmfu.com/script/BookStore.js ")
                        .Replace("&nbsp", "")
                        .Replace("\r", "")
                        .Replace("\n", "")
                        .Replace("\t", "")
                        .Replace("|", "")
                        .Replace(" ", "");

                // The lazy group stops in front of the closing "]]", so it is appended again.
                JArray typearr = ParseScriptArray(typeshtml, "CategoryArr:(.*?)]]", "]]", "CategoryArr");
                JArray subarr = ParseScriptArray(subtypes, "SubCategoryArr=(.*?);", string.Empty, "SubCategoryArr");

                // CategoryArr: [["全部", "-1"], ["玄幻", "21"], ["奇幻", "1"], ["武侠", "2"], ...
                // Each entry is [name, id]; the entry with id "-1" is skipped.
                List<NovalTypeTemp> lstypes = new List<NovalTypeTemp>(10);
                foreach (JToken entry in typearr)
                {
                    string webNum = entry[1].ToString();
                    if (webNum == "-1")
                    {
                        continue;
                    }

                    lstypes.Add(new NovalTypeTemp()
                    {
                        WebNum = webNum.ToInt(),
                        Name = entry[0].ToString(),
                        Source = Source
                    });
                }

                // Sub-category entries are read as [parent id, sub id, name]. Grouping them by
                // parent id once replaces the nested scan while keeping the same output order
                // (types in list order, sub-types in script order).
                ILookup<string, JToken> subByParent = subarr.ToLookup(s => s[0].ToString());
                List<NovalSubType> subtypels = new List<NovalSubType>(300);
                foreach (NovalTypeTemp parent in lstypes)
                {
                    foreach (JToken obj in subByParent[parent.WebNum.ToString()])
                    {
                        subtypels.Add(new NovalSubType()
                        {
                            Name = obj[2].ToString(),
                            ParentWebNum = parent.WebNum,
                            WebNum = obj[1].ToString().ToInt(),
                            Source = Source
                        });
                    }
                }

                // Replace the stored rows of this source only after both lists were built.
                typecol.Remove(Query<NovalTypeTemp>.EQ(p => p.Source, Source));
                if (lstypes.Count > 0)
                {
                    typecol.InsertBatch(lstypes);
                }

                subcol.Remove(Query<NovalSubType>.EQ(p => p.Source, Source));
                if (subtypels.Count > 0)
                {
                    subcol.InsertBatch(subtypels);
                }
            }

            /// <summary>
            /// Cuts an array literal out of a script with <paramref name="pattern"/> (group 1),
            /// appends <paramref name="closing"/> and parses the result as a JSON array.
            /// </summary>
            private static JArray ParseScriptArray(string script, string pattern, string closing, string label)
            {
                Match match = Regex.Match(script, pattern,
                    RegexOptions.IgnoreCase | RegexOptions.Multiline | RegexOptions.Singleline);
                if (!match.Success)
                {
                    throw new InvalidOperationException(label + " was not found in the downloaded script.");
                }

                JArray array = JsonConvert.DeserializeObject(match.Groups[1].Value + closing) as JArray;
                if (array == null)
                {
                    throw new InvalidOperationException(label + " in the downloaded script is not a JSON array.");
                }

                return array;
            }
    
            /// <summary>
            /// Collects novels from the click-ranking pages (pages 1 to 10; the monthly ranking
            /// according to the original author's note) and passes the de-duplicated list to
            /// PublicMethod.InsertAndUpdateNovalTmp.
            /// </summary>
            /// <remarks>
            /// A page without the ranking table, a page with fewer than 50 rows, or a row with a
            /// missing cell is skipped instead of aborting the whole run.
            /// </remarks>
            public void GetNovals()
            {
                List<NovalTempBase> qdls = new List<NovalTempBase>(500);
                // Book ids already collected; the same book may show up on more than one page.
                HashSet<int> seenBookNums = new HashSet<int>();

                for (int page = 1; page < 11; page++)
                {
                    string sourcehtml =
                        NetHelper.HttpGet("http://top.qidian.com/Book/TopDetail.aspx?TopType&Time=2&PageIndex=" + page);
                    HtmlDocument htmldocc = new HtmlDocument();
                    htmldocc.LoadHtml(sourcehtml);
                    var doc = htmldocc.GetElementbyId("textlist");
                    if (doc == null)
                    {
                        continue;
                    }

                    // XPath positions are 1-based; row 1 is not read, rows 2..51 hold the 50 books.
                    for (int row = 2; row < 52; row++)
                    {
                        var trdoc = doc.SelectSingleNode("tr[" + row + "]");
                        if (trdoc == null)
                        {
                            // Short page: no further rows to read.
                            break;
                        }

                        var tdtype = trdoc.SelectSingleNode("td[2]/a");
                        var tdbook = trdoc.SelectSingleNode("td[3]/a[1]");
                        var tdclick = trdoc.SelectSingleNode("td[4]");
                        var tdauth = trdoc.SelectSingleNode("td[5]/a");
                        if (tdtype == null || tdbook == null || tdclick == null || tdauth == null)
                        {
                            continue;
                        }

                        // Verbatim strings keep the regex escapes intact.
                        Match typematch = Regex.Match(tdtype.OuterHtml, @"ChannelId=(\d*?)&SubCategoryId=(\d*?)'");
                        Match bookmatck = Regex.Match(tdbook.OuterHtml, @"Book/(\d*?)\.aspx");
                        Match authmatch = Regex.Match(tdauth.OuterHtml, @"id=(\d*?)""");

                        int booknum = bookmatck.Groups[1].Value.ToInt();
                        if (!seenBookNums.Add(booknum))
                        {
                            continue;
                        }

                        // Cover image, e.g. http://image.cmfu.com/books/3127618/3127618.jpg
                        string titleimg = "http://image.cmfu.com/books/" + booknum + "/" + booknum + ".jpg";

                        qdls.Add(new NovalTempBase()
                        {
                            AuthName = tdauth.InnerText.Trim(),
                            AuthId = authmatch.Groups[1].Value.ToInt(),
                            SubType = typematch.Groups[2].Value.ToInt(),
                            TitleImg = titleimg,
                            Title = tdbook.InnerText.Trim(),
                            TotalClick = tdclick.InnerText.ToInt(),
                            TotalComment = 0,
                            Type = typematch.Groups[1].Value.ToInt(),
                            SourceWebNum = booknum,
                            Source = Source
                        });
                    }
                }

                PublicMethod.InsertAndUpdateNovalTmp(qdls,Source);
            }
    
            //public void GetNovalsByType()
            //{
            //}
            /// <summary>
            /// Collects the chapters of every stored novel that belongs to this source.
            /// Some sources also split their chapters into volumes.
            /// </summary>
            public void GetNovalChapers()
            {
                // Reference URL left by the original author:
                // http://sight.qq.com/book/chapterpage?uin=0&g_tk=5381&callback=_Callback&pagesize=100&pageno=2&bid=16043&_r=0.6934567329008132
                MongoCollection<NovalTempBase> storedNovals =
                    MongoConnectionFactory.GetMongoCollction<NovalTempBase>("Noval", typeof (NovalTempBase));

                // Materialise the matching novels first, then process them one at a time.
                List<NovalTempBase> ownNovals = storedNovals
                    .AsQueryable()
                    .Where(p => p.Source == Source)
                    .ToList();

                for (int index = 0; index < ownNovals.Count; index++)
                {
                    GetSingleNovalChapers(ownNovals[index].SourceWebNum);
                }
            }
    
            /// <summary>
            /// Reads the table-of-contents page of one novel, replaces the volumes stored for
            /// it and passes the parsed chapters to PublicMethod.InsertChapterTempToSQL.
            /// </summary>
            /// <param name="novalwebnum">Id of the novel on the source site; it is part of the page URL.</param>
            /// <remarks>
            /// Best effort: a failure for this novel is written to the trace output and the
            /// method returns, so a caller looping over many novels keeps going.
            /// </remarks>
            public void GetSingleNovalChapers(int novalwebnum)
            {
                IMongoQuery q2 = Query<NovalVolumeTemp>.EQ(p => p.Source, Source);
                IMongoQuery q1 = Query<NovalVolumeTemp>.EQ(p => p.NovalWebNum, novalwebnum);
                IMongoQuery[] qarray = { q1, q2 };
                IMongoQuery query = Query.And(qarray);

                var volumecol = MongoConnectionFactory.GetMongoCollction<NovalVolumeTemp>("Noval", typeof (NovalVolumeTemp));
                List<NovalChapterTemp> lschapters = new List<NovalChapterTemp>(1000);
                List<NovalVolumeTemp> lsvolumes = new List<NovalVolumeTemp>(10);
                int chapterorder = 1;
                int volumeorder = 1;

                // e.g. http://read.qidian.com/BookReader/3127618.aspx
                string url = "http://read.qidian.com/BookReader/" + novalwebnum + ".aspx";
                try
                {
                    HtmlDocument htmldocc = new HtmlDocument();
                    htmldocc.LoadHtml(NetHelper.HttpGet(url));
                    var doc = htmldocc.GetElementbyId("content");
                    if (doc == null)
                    {
                        System.Diagnostics.Trace.TraceWarning("GetSingleNovalChapers: no 'content' element at {0}", url);
                        return;
                    }

                    // The page is read as pairs of sibling divs: div[i] is a volume header and
                    // div[i + 1] holds that volume's chapter links.
                    for (int i = 1; ; i += 2)
                    {
                        var topdoc = doc.SelectSingleNode("div[" + i + "]");
                        if (topdoc == null)
                        {
                            break;
                        }

                        // Per the original author's note, VIP sections have no link in the header;
                        // their chapters are stored with volume id 0.
                        int topnum = 0;
                        var topa = topdoc.SelectSingleNode("div/a");
                        if (topa != null)
                        {
                            // href="http://www.qidian.com/BookReader/vol,107580,486625.aspx"
                            Match m = Regex.Match(topa.OuterHtml, @",(\d*?)\.aspx");
                            topnum = m.Groups[1].Value.ToInt();

                            lsvolumes.Add(new NovalVolumeTemp()
                            {
                                Sort = volumeorder,
                                WebNum = topnum,
                                Name = ReadVolumeName(topdoc.SelectSingleNode("div/b")),
                                NovalWebNum = novalwebnum,
                                Source = Source
                            });
                            volumeorder++;
                        }

                        var contextdoc = doc.SelectSingleNode("div[" + (i + 1) + "]");
                        // SelectNodes yields null when nothing matches, e.g. a volume without chapters.
                        var chaperas = contextdoc == null ? null : contextdoc.SelectNodes("div/ul/li/a");
                        if (chaperas == null)
                        {
                            continue;
                        }

                        // <a itemprop='url' href="http://read.qidian.com/BookReader/107580,20901221.aspx" ...>
                        foreach (var chapera in chaperas)
                        {
                            Match chapmatchwebnum = Regex.Match(chapera.OuterHtml, @",(\d*?)\.aspx");
                            lschapters.Add(new NovalChapterTemp()
                            {
                                Name = chapera.InnerText.Trim(),
                                Sort = chapterorder,
                                WebNum = chapmatchwebnum.Groups[1].Value.ToInt(),
                                VolumeId = topnum,
                                NovalWebNum = novalwebnum,
                                Source = Source
                            });
                            chapterorder++;
                        }
                    }

                    volumecol.Remove(query);
                    // NOTE(review): an empty batch may be rejected by some legacy driver versions,
                    // which previously prevented the chapters below from being stored — confirm.
                    if (lsvolumes.Count > 0)
                    {
                        volumecol.InsertBatch(lsvolumes);
                    }

                    PublicMethod.InsertChapterTempToSQL(lschapters, Source, novalwebnum);
                }
                catch (Exception ex)
                {
                    // Keep the best-effort contract, but leave a trace instead of failing silently.
                    System.Diagnostics.Trace.TraceError("GetSingleNovalChapers failed for {0}: {1}", url, ex);
                }
            }

            /// <summary>
            /// Extracts the volume title from the header's bold node: "&amp;nbsp" is removed and the
            /// text after the first ';' is used; without a ';' the whole text is returned.
            /// </summary>
            private static string ReadVolumeName(HtmlNode nameNode)
            {
                if (nameNode == null)
                {
                    return string.Empty;
                }

                string[] parts = nameNode.InnerText.Trim().Replace("&nbsp", "").Split(';');
                return parts.Length > 1 ? parts[1] : parts[0];
            }
    
    
            /// <summary>
            /// Refreshes TotalClick of every stored novel of this source from the novel's detail
            /// page and saves the document back to the collection.
            /// </summary>
            /// <remarks>
            /// Only the click count is updated; comment counts are not collected by this method.
            /// A novel whose page lacks the expected cell or a readable number is skipped.
            /// </remarks>
            public void GetNovalCilckComment()
            {
                var novalcol = MongoConnectionFactory.GetMongoCollction<NovalTempBase>("Noval", typeof(NovalTempBase));
                var books = novalcol.AsQueryable().Where(p => p.Source == Source).ToList();

                foreach (var novalTempBase in books)
                {
                    // e.g. http://www.qidian.com/Book/3106580.aspx
                    string url = "http://www.qidian.com/Book/" + novalTempBase.SourceWebNum + ".aspx";
                    HtmlDocument htmldocc = new HtmlDocument();
                    htmldocc.LoadHtml(NetHelper.HttpGet(url));

                    var cliclickdiv = htmldocc.GetElementbyId("contentdiv");
                    var clicknode = cliclickdiv == null
                        ? null
                        : cliclickdiv.SelectSingleNode("div/div[1]/table/tr/td[1]");
                    if (clicknode == null)
                    {
                        continue;
                    }

                    // The cell holds the "总点击" label followed by the number. The first digit run
                    // is taken instead of stripping separators one by one: the previous
                    // Replace("", "") call threw ArgumentException on every invocation.
                    Match digits = Regex.Match(clicknode.InnerText.Replace("总点击", ""), "[0-9]+");
                    int click;
                    if (!digits.Success || !int.TryParse(digits.Value, out click))
                    {
                        continue;
                    }

                    novalTempBase.TotalClick = click;
                    novalcol.Save(novalTempBase);
                }
            }
    
        
        }
    }
  • 相关阅读:
    android view生命周期
    ViewPager 滑动页(四)
    android 中如何获取camera当前状态
    Android LayoutInflater原理分析,带你一步步深入了解View(一)
    仿Twitter登陆移动背景效果
    Android应用性能优化之使用SQLiteStatement优化SQLite操作
    GreenDao官方文档翻译(下)
    高级IO
    linux信号
    LINUX进程
  • 原文地址:https://www.cnblogs.com/zihunqingxin/p/4022572.html
Copyright © 2011-2022 走看看