using System;
using System.Collections.Generic;
using System.Text;
using System.Collections;
using System.Net;
using System.IO;
using System.Data;
using System.Data.SqlClient;
using CnBlogCollector.Properties;
namespace CnBlogCollector
{
/// <summary>
/// Scrapes article links from the paged cnblogs.com front page, downloads each
/// linked article and stores its title and body HTML in the [Word_Content2] table.
/// </summary>
public class Collector
{
    #region Fields
    // Front-page URL template; {0} is replaced with the page number.
    private readonly string cnblogMain = "http://www.cnblogs.com/p{0}";

    // NOTE(review): the credentials are embedded in source exactly as before;
    // consider moving the connection string to configuration.
    private const string ConnectionString = @"Data Source=.;Initial Catalog=WordProject;User ID=misp2;Password=misp2;";

    // Parameterized so that scraped text is never spliced into the SQL statement.
    private const string InsertSql = "insert into [Word_Content2](Title,Content) values (@title, @content)";

    // Markup fragments used to locate the interesting parts of the downloaded HTML.
    private const string TitleLinkMarker = "<a class=\"titlelnk\" href=\"";
    private const string TitleTextMarker = "target=\"_blank\">";
    private const string AnchorCloseMarker = "</a>";
    private const string BodyStartMarker = "<div id=\"cnblogs_post_body\">";
    private const string BodyEndMarker = "if ($ != jQuery) {";

    // Number of characters cut off immediately before BodyEndMarker.
    // NOTE(review): presumably the markup that precedes the script on the
    // article page; the value is carried over unchanged - confirm against a live page.
    private const int BodyEndTrim = 33;

    private readonly WebClient wc = new WebClient();
    #endregion

    #region Directory creation
    /// <summary>
    /// Ensures that the given directory exists, creating it (and any missing
    /// parent directories) when necessary.
    /// </summary>
    /// <param name="path">Relative or absolute directory path.</param>
    /// <returns>The absolute form of <paramref name="path"/>.</returns>
    public string CreateFolderIfNot(string path)
    {
        string fullPath = Path.GetFullPath(path);
        // CreateDirectory does nothing when the directory already exists,
        // so no separate existence check is required.
        Directory.CreateDirectory(fullPath);
        return fullPath;
    }
    #endregion

    #region Page collection
    /// <summary>
    /// Walks the front pages numbered from <paramref name="startIndex"/> up to, but not
    /// including, <paramref name="endIndex"/>, follows every title link found on them
    /// and inserts each article's title and body into the database.
    /// </summary>
    /// <remarks>
    /// Collection is best effort per article: an article that cannot be parsed,
    /// downloaded or stored is reported through <see cref="System.Diagnostics.Trace"/>
    /// and skipped. Failing to open the connection or to download a front page is
    /// not handled here and propagates to the caller.
    /// </remarks>
    /// <param name="startIndex">First front-page number to read (inclusive).</param>
    /// <param name="endIndex">Front-page number to stop at (exclusive).</param>
    public void Gather(int startIndex, int endIndex)
    {
        // The using block closes the connection even when a download throws.
        using (SqlConnection con = new SqlConnection(ConnectionString))
        {
            con.Open();

            for (int i = startIndex; i < endIndex; i++)
            {
                // Download the front page and decode it as UTF-8.
                string mainData = Encoding.UTF8.GetString(wc.DownloadData(string.Format(cnblogMain, i.ToString())));

                while (true)
                {
                    int linkPos = mainData.IndexOf(TitleLinkMarker, StringComparison.Ordinal);
                    if (linkPos < 0)
                    {
                        break;
                    }

                    // Always step past the marker first so that a broken link can
                    // never be examined twice.
                    mainData = mainData.Substring(linkPos + TitleLinkMarker.Length);

                    string articleAddr;
                    string articleTitle;
                    if (!TryParseLink(mainData, out articleAddr, out articleTitle))
                    {
                        System.Diagnostics.Trace.TraceWarning("Page {0}: an article link could not be parsed and was skipped.", i);
                        continue;
                    }

                    try
                    {
                        // The scraped address is used verbatim; treating it as a format
                        // string would fail on URLs that contain '{' or '}'.
                        string articlePage = Encoding.UTF8.GetString(wc.DownloadData(articleAddr));

                        string articleBody;
                        if (!TryExtractBody(articlePage, out articleBody))
                        {
                            System.Diagnostics.Trace.TraceWarning("Article '{0}': body markers not found, skipped.", articleAddr);
                            continue;
                        }

                        string articleData = "\r\n" + articleTitle + "\r\n\r\n" + "\r\n" + articleBody;
                        SaveArticle(con, articleTitle, articleData);
                    }
                    catch (Exception ex)
                    {
                        // Deliberately broad: one failing article (network error, bad
                        // address, insert failure) must not abort the whole run.
                        System.Diagnostics.Trace.TraceWarning("Article '{0}' skipped: {1}", articleAddr, ex.Message);
                    }
                }
            }
        }
    }

    /// <summary>
    /// Reads the link address and title from front-page HTML that starts directly
    /// after the title-link marker.
    /// </summary>
    /// <returns><c>false</c> when the expected markup is missing or out of order.</returns>
    private static bool TryParseLink(string html, out string address, out string title)
    {
        address = null;
        title = null;

        // The address runs up to the quote that closes the href attribute.
        int addressEnd = html.IndexOf('"');
        int titleMarker = html.IndexOf(TitleTextMarker, StringComparison.Ordinal);
        int anchorClose = html.IndexOf(AnchorCloseMarker, StringComparison.Ordinal);
        if (addressEnd <= 0 || titleMarker < 0 || anchorClose < 0)
        {
            return false;
        }

        int titleStart = titleMarker + TitleTextMarker.Length;
        if (anchorClose < titleStart)
        {
            // The first closing tag precedes the title text, so the title marker
            // found belongs to some later link.
            return false;
        }

        address = html.Substring(0, addressEnd);
        title = html.Substring(titleStart, anchorClose - titleStart);
        return true;
    }

    /// <summary>
    /// Cuts the article body HTML out of a downloaded article page.
    /// </summary>
    /// <returns><c>false</c> when either delimiting marker is missing.</returns>
    private static bool TryExtractBody(string page, out string body)
    {
        body = null;

        int start = page.IndexOf(BodyStartMarker, StringComparison.Ordinal);
        if (start < 0)
        {
            return false;
        }

        string tail = page.Substring(start + BodyStartMarker.Length);
        int length = tail.IndexOf(BodyEndMarker, StringComparison.Ordinal) - BodyEndTrim;
        if (length < 0)
        {
            // End marker missing, or closer to the start than the trimmed span.
            return false;
        }

        body = tail.Substring(0, length);
        return true;
    }

    /// <summary>
    /// Inserts one article row using a parameterized command.
    /// </summary>
    private static void SaveArticle(SqlConnection con, string title, string content)
    {
        using (SqlCommand cmd = new SqlCommand(InsertSql, con))
        {
            // Size -1 declares the parameters as nvarchar(max).
            cmd.Parameters.Add("@title", SqlDbType.NVarChar, -1).Value = title;
            cmd.Parameters.Add("@content", SqlDbType.NVarChar, -1).Value = content;
            cmd.ExecuteNonQuery();
        }
    }
    #endregion
}
}