zoukankan      html  css  js  c++  java
  • Collector.cs


    using System;
    using System.Collections.Generic;
    using System.Text;
    using System.Collections;
    using System.Net;
    using System.IO;
    using System.Data;
    using System.Data.SqlClient;

    using CnBlogCollector.Properties;

    namespace CnBlogCollector
    {
        /// <summary>
        /// 数据采集类
        /// </summary>
        public class Collector
        {
           #region 变量
            private string cnblogMain = "http://www.cnblogs.com/p{0}";//cnblog首页地址
            private WebClient wc = new WebClient();
            #endregion


           #region 创建目录
            /// <summary>
            /// 判断目录是否存在,若不存在则创建该目录
            /// </summary>
            /// <param name="path"></param>
            /// <returns></returns>
            public string CreateFolderIfNot(string path)
            {
                //获取该目录的完整路径
                string rtn = Path.GetFullPath(path);
                //若该目录不存在
                if (!Directory.Exists(rtn))
                {
                    //创建该目录
                    Directory.CreateDirectory(rtn);
                }
                return rtn;
            }
            #endregion

           #region 采集网页数据
           public void Gather(int startIndex, int endIndex)
           {

               SqlConnection con = new SqlConnection(@"Data Source=.;Initial Catalog=WordProject;User ID=misp2;Password=misp2;");
               con.Open();
               //根据startIndex和endIndex来遍历cnblog首页上文章
               for (int i = startIndex; i < endIndex; i++)
               {
                   //从cnblog首页下载页面数据并将其转换成UTF8编码格式的STRING
                   string mainData = Encoding.UTF8.GetString(wc.DownloadData(string.Format(cnblogMain, i.ToString())));


                   int j = 1;
                   //二次遍历抓取cnblog首页面上的文章链接,并顺着这些链接进入文章页面采集数据
                   while (mainData.IndexOf("<a class=\"titlelnk\" href=\"") >= 0)
                   {
                       try
                       {
                           mainData = mainData.Substring(mainData.IndexOf("<a class=\"titlelnk\" href=\"") + 26);

                           //获取文章页面的链接地址
                           string articleAddr = mainData.Substring(0, mainData.IndexOf("\""));

                           //获取文章标题
                           string articleTitle = mainData.Substring(mainData.IndexOf("target=\"_blank\">") + 16,
                                                                    mainData.IndexOf("</a>") - mainData.IndexOf("target=\"_blank\">") - 16);

                           //下载文章页面数据
                           string articleData = Encoding.UTF8.GetString(wc.DownloadData(string.Format(articleAddr, i.ToString())));

                           //截取文章内容HTML
                           articleData = articleData.Substring(articleData.IndexOf("<div id=\"cnblogs_post_body\">") + 28);

                           articleData = articleData.Substring(0, articleData.IndexOf("if ($ != jQuery) {") - 33);

                           articleData = "\r\n"+articleTitle + "\r\n\r\n" +"\r\n"+articleData;

                           //输出数据到本地文件
                           //string pth = CreateFolderIfNot(Settings.Default.OutPath) + i + "_" + j + ".txt";
                           //if (!File.Exists(pth))
                           //    File.AppendAllText(pth,
                           //                       articleData,
                           //                       Encoding.UTF8);




                           string sqlO = @"insert into [Word_Content2](Title,Content) values ('" + articleTitle + "','" + articleData + "')";
                           SqlCommand cmd2 = new SqlCommand(sqlO, con);
                           cmd2.ExecuteNonQuery();
                           


                           j++;
                       }
                       catch (Exception ex)
                       {
                       }
                   }
               }
               con.Close();
           }
           #endregion

        }
    }
  • 相关阅读:
    [转] CNN工作步骤解析
    [转] Attention模型结构
    [转] Boost算法
    [转] GDBT详解
    [转] Noise Contrastive Estimation 噪声对比估计 资料
    [转] 对数似然与交叉熵
    [转] ELMO
    [转] Batch Normalization
    强化学习总结
    MySQL 与 Hive 逻辑相关
  • 原文地址:https://www.cnblogs.com/shihao/p/2506909.html
Copyright © 2011-2022 走看看