zoukankan      html  css  js  c++  java
  • C#运用实例.读取csv里面的词条,对每一个词条抓取百度百科相关资料,然后存取到数据库

    第一步:首先需要将csv先装换成datatable,这样我们就容易进行对datatable进行遍历:

      /// 将CSV文件的数据读取到DataTable中
        /// CSV文件路径
        /// 返回读取了CSV数据的DataTable
        public DataTable OpenCSV(string fileName)
        {
            DataTable dt = new DataTable();
            FileStream fs = new FileStream(fileName, System.IO.FileMode.Open, System.IO.FileAccess.Read);
            StreamReader sr = new StreamReader(fs, System.Text.Encoding.Default);
            //记录每次读取的一行记录
            string strLine = "";
            //记录每行记录中的各字段内容
            string[] aryLine;
            //标示列数
            int columnCount = 0;
            //标示是否是读取的第一行
            bool IsFirst = true;
            //逐行读取CSV中的数据
            while ((strLine = sr.ReadLine()) != null)
            {
                aryLine = strLine.Split(',');
                if (IsFirst == true)
                {
                    IsFirst = false;
                    columnCount = aryLine.Length;
                    //创建列
                    for (int i = 0; i < columnCount; i++)
                    {
                        DataColumn dc = new DataColumn(aryLine[i]);
                        dt.Columns.Add(dc);
                    }
                }
                else
                {
                    DataRow dr = dt.NewRow();
                    for (int j = 0; j < columnCount; j++)
                    {
                        dr[j] = aryLine[j];
                    }
                    dt.Rows.Add(dr);
                }
            }
            sr.Close();
            fs.Close();
            return dt;
        }
    将csv文件转换成datatable

    特别备注:遍历dadatable的几种方式已经将datatable装换成csv文件:

    一、将DataTable内容写入到CSV文件
            ///
    
            /// 将DataTable中的数据保存成CSV文件
            ///
    
            private void btnSaveCSV_Click(object sender, EventArgs e)
            {
                saveFileDialog1.Filter = "CSV文件|*.CSV";
                saveFileDialog1.InitialDirectory = "C:\";
                if (saveFileDialog1.ShowDialog() == DialogResult.Cancel)
                {
                    return;
                }
                else
                {
                    string fileName = saveFileDialog1.FileName;
                    SaveCSV(ds.Tables[0], fileName);
                }
            }
     
            ///
    
            /// 将DataTable中数据写入到CSV文件中
            ///
    
            /// 提供保存数据的DataTable
            /// CSV的文件路径
            public void SaveCSV(DataTable dt, string fileName)
            {
                FileStream fs = new FileStream(fileName, System.IO.FileMode.Create, System.IO.FileAccess.Write);
                StreamWriter sw = new StreamWriter(fs, System.Text.Encoding.Default);
                string data = "";
                //写出列名称
                for (int i = 0; i < dt.Columns.Count; i++)
                {
                    data += dt.Columns[i].ColumnName.ToString();
                    if (i < dt.Columns.Count - 1)
                    {
                        data += ",";
                    }
                }
                sw.WriteLine(data);
                //写出各行数据
                for (int i = 0; i < dt.Rows.Count; i++)
                {
                    data = "";
                    for (int j = 0; j < dt.Columns.Count; j++)
                    {
                        data += dt.Rows[i][j].ToString();
                        if (j < dt.Columns.Count - 1)
                        {
                            data += ",";
                        }
                    }
                    sw.WriteLine(data);
                }
                sw.Close();
                fs.Close();
                MessageBox.Show("CSV文件保存成功!");
            }
     
    将datatable转换成csv
    遍历datatable的方法2009-09-08 10:02方法一:   
    DataTable dt = dataSet.Tables[0];   
    for(int i = 0 ; i < dt.Rows.Count ; i++)   
    {   
       string strName = dt.Rows[i]["字段名"].ToString();   
    }   
      
    方法二:   
    foreach(DataRow myRow in myDataSet.Tables["temp"].Rows)   
    {   
          var str = myRow[0].ToString();   
    }   
      
    方法三:   
    foeach(DataRow dr in dt.Rows)      
    {      
         object value = dr["ColumnsName"];      
    }   
      
    方法四:   
    DataTable dt=new DataTable();      
    foreach(DataRow dr in dt.Rows)      
    {    
       for(int i=0;i<dt.Columns.Count;i++)    
       {    
    dr[i];    
       }      
    }   
      
     
    方法五
    DataRow[] dataRows = null;
    dataRows = dataTable.Select(fieldParentID + "='" + treeNode.Tag.ToString() + "'", dataTable.DefaultView.Sort);
    foreach (DataRow dataRow in dataRows)
    { 
    
         DataRow dataRow = dataTable.Rows[i]; 
    
         ?? = dataRow[fieldParentID].ToString();
    } 
    遍历datatable的几种方式

    第二步呢: 就是必须用到post方式提交啦,

        /// <summary>
        /// post数据请求,返回html代码
        /// </summary>
        /// <param name="areaName">输入百科词条</param>
        /// <returns></returns>
        public string UrlPost(string areaName)
        {
            Encoding encoding = Encoding.GetEncoding("UTF-8");
            Stream outstream = null;
            Stream instream = null;
            StreamReader sr = null;
    
            string url = "http://baike.baidu.com/search/word?word=" + areaName;
    
            HttpWebRequest request = null;
    
            HttpWebResponse response = null;
    
    
            // 准备请求,设置参数
    
            request = WebRequest.Create(url) as HttpWebRequest;
    
            request.Method = "POST";
    
            //request.ContentType = "application/x-www-form-urlencoded";
            request.ContentType = "text/xml";//注意了,上面的那种方式不可采用,因为当遇到xml格式的就会报远程服务器错误。500
    
    
            byte[] data = encoding.GetBytes(url);
    
            request.ContentLength = data.Length;
    
            outstream = request.GetRequestStream();
    
            outstream.Write(data, 0, data.Length);
    
            outstream.Flush();
    
            outstream.Close();
    
            //发送请求并获取相应回应数据
    
    
            response = request.GetResponse() as HttpWebResponse;
    
            //直到request.GetResponse()程序才开始向目标网页发送Post请求
    
            instream = response.GetResponseStream();
    
            sr = new StreamReader(instream, encoding);
    
            //返回结果网页(html)代码
    
            string content = sr.ReadToEnd();
            return content;
    
        }
    post方式请求数据返回html数据

     第三部,就是遍历datatable,然后遍历出每个关键字,以post方式提交到百度百科,返回html,然后用正则表达式进行对数据的筛选

        /// <summary>
        /// 点击按钮执行程序
        /// </summary>
        /// <param name="sender"></param>
        /// <param name="e"></param>
        protected void btn_click_Click(object sender, EventArgs e)
        {
            DataTable dts = OpenCSV("C:\Users\kingtopinfo\Desktop\CITYLOCATION.csv");
           
            foreach (DataRow dr in dts.Rows)
            {
                string sql = "";
                string regstr = "";
                string regspit = "";
                string Title, Content = "";
                int Id = Convert.ToInt32(dr["ID"].ToString());
                int CityId = Convert.ToInt32(dr["CITYID"].ToString());
                decimal Latitude = Convert.ToDecimal(dr["LATITUDE"]);
                decimal Longitude = Convert.ToDecimal(dr["LONGITUDE"]);
                //提交数据给页面并返回html
                string value = dr["CITYNAME"].ToString();
                string html = UrlPost(value);
                //对数据进行处理。
                Regex reg = new Regex(@"<div class=""biItemInner"">(s|S)+?(</div>s*?)");
                MatchCollection mc = reg.Matches(html);
                foreach (Match m in mc)
                {
                    regspit += Regex.Replace(Regex.Replace(m.Value, @"</div[^>]*>", ":"), @"</span[^>]*>", ",");
                   
                }
                regstr = Regex.Replace(regspit, @"</?div[^>]*>|</?span[^>]*>|</?a[^>]*>|&nbsp;", "");
                string[] area = Regex.Split(regstr, ":", RegexOptions.IgnoreCase);
                foreach (string i in area)
                {
                    string[] areaspit = i.Split(',');
                    if (areaspit.Length==2)
                    {
                        Title = areaspit[0];
                        Content = areaspit[1];
                        SqlParameter[] paras = { 
                                               new SqlParameter("@Id", Id),
                                               new SqlParameter("@cityId", CityId),
                                               new SqlParameter("@latitude", Latitude),
                                               new SqlParameter("@longitude", Longitude),
                                               new SqlParameter("@cityName", value),
                                               new SqlParameter("@title", Title),
                                               new SqlParameter("@content", Content ),
                                               };
                       
                        sql = "INSERT INTO chinaarea(ID,CITYID,LATITUDE,LONGITUDE,CITYNAME,TITLE,CONTENT) VALUES (@Id,@cityId,@latitude,@longitude,@cityName,@title,@content)";
                        SaveSql(sql,paras);
                   
                    }
                       
                     
                }
            }
        }
    遍历表格,发送请求,正则筛选,存入数据库

    第四部分,就是数据库的操作了

    const string connString = @"Data Source =.;Database = Area;Integrated Security=True";
    
      /// <summary>
        /// 存入数据库
        /// </summary>
        /// <param name="str"></param>
        public void SaveSql(string str,SqlParameter[] para)
        {
            SqlConnection conn = new SqlConnection(connString);
            conn.Open();
            string mysql = str;
            SqlCommand cmd = new SqlCommand(mysql, conn);
            cmd.Parameters.AddRange(para);
            cmd.ExecuteNonQuery();
            conn.Close();
        }
    数据库操作

     此外,这里对正则表达式进行解释

     因为返回的都是这种类型的数据,而我们需要得到的是里面的内容,所以采取正则匹配和替换

    <div class="biItem"><div class="biItemInner"><span class="biTitle">中文名称</span><div class="biContent">北京</div></div></div>

    <div class="biItem"><div class="biItemInner"><span class="biTitle">外文名称</span><div class="biContent">Beijing</div></div></div>

     Regex reg = new Regex(@"<div class=""biItemInner"">(s|S)+?(</div>s*?)");     找出<div class="biItemInner"><div>里的内容

    Regex.Replace(Regex.Replace(m.Value, @"</div[^>]*>", ":"), @"</span[^>]*>", ","); 对div和span进行替换和以:分组(便于之后数据处理)

    Regex.Replace(regspit, @"</?div[^>]*>|</?span[^>]*>|</?a[^>]*>|&nbsp;", ""); 将regspit里面&nbsp 还有a标签进行替换(也就是去掉里面的这些标签)

    最终的代码整合:

    using System;
    using System.Collections.Generic;
    using System.Linq;
    using System.Web;
    using System.Web.UI;
    using System.Web.UI.WebControls;
    using System.Data;
    using System.Data.SqlClient;
    using System.Text;
    using System.IO;
    using System.Net;
    using System.Text.RegularExpressions;
    public partial class csv : System.Web.UI.Page
    {
        const string connString = @"Data Source =.;Database = Area;Integrated Security=True";
        protected void Page_Load(object sender, EventArgs e)
        {
           
        }
        /// <summary>
        /// 点击按钮执行程序
        /// </summary>
        /// <param name="sender"></param>
        /// <param name="e"></param>
        protected void btn_click_Click(object sender, EventArgs e)
        {
            DataTable dts = OpenCSV("C:\Users\kingtopinfo\Desktop\CITYLOCATION.csv");
           
            foreach (DataRow dr in dts.Rows)
            {
                string sql = "";
                string regstr = "";
                string regspit = "";
                string Title, Content = "";
                int Id = Convert.ToInt32(dr["ID"].ToString());
                int CityId = Convert.ToInt32(dr["CITYID"].ToString());
                decimal Latitude = Convert.ToDecimal(dr["LATITUDE"]);
                decimal Longitude = Convert.ToDecimal(dr["LONGITUDE"]);
                //提交数据给页面并返回html
                string value = dr["CITYNAME"].ToString();
                string html = UrlPost(value);
                //对数据进行处理。
                Regex reg = new Regex(@"<div class=""biItemInner"">(s|S)+?(</div>s*?)");
                MatchCollection mc = reg.Matches(html);
                foreach (Match m in mc)
                {
                    regspit += Regex.Replace(Regex.Replace(m.Value, @"</div[^>]*>", ":"), @"</span[^>]*>", ",");
                   
                }
                regstr = Regex.Replace(regspit, @"</?div[^>]*>|</?span[^>]*>|</?a[^>]*>|&nbsp;", "");
                string[] area = Regex.Split(regstr, ":", RegexOptions.IgnoreCase);
                foreach (string i in area)
                {
                    string[] areaspit = i.Split(',');
                    if (areaspit.Length==2)
                    {
                        Title = areaspit[0];
                        Content = areaspit[1];
                        SqlParameter[] paras = { 
                                               new SqlParameter("@Id", Id),
                                               new SqlParameter("@cityId", CityId),
                                               new SqlParameter("@latitude", Latitude),
                                               new SqlParameter("@longitude", Longitude),
                                               new SqlParameter("@cityName", value),
                                               new SqlParameter("@title", Title),
                                               new SqlParameter("@content", Content ),
                                               };
                       
                        sql = "INSERT INTO chinaarea(ID,CITYID,LATITUDE,LONGITUDE,CITYNAME,TITLE,CONTENT) VALUES (@Id,@cityId,@latitude,@longitude,@cityName,@title,@content)";
                        SaveSql(sql,paras);
                   
                    }
                       
                     
                }
            }
        }
    
        /// 将CSV文件的数据读取到DataTable中
        /// CSV文件路径
        /// 返回读取了CSV数据的DataTable
        public DataTable OpenCSV(string fileName)
        {
            DataTable dt = new DataTable();
            FileStream fs = new FileStream(fileName, System.IO.FileMode.Open, System.IO.FileAccess.Read);
            StreamReader sr = new StreamReader(fs, System.Text.Encoding.Default);
            //记录每次读取的一行记录
            string strLine = "";
            //记录每行记录中的各字段内容
            string[] aryLine;
            //标示列数
            int columnCount = 0;
            //标示是否是读取的第一行
            bool IsFirst = true;
            //逐行读取CSV中的数据
            while ((strLine = sr.ReadLine()) != null)
            {
                aryLine = strLine.Split(',');
                if (IsFirst == true)
                {
                    IsFirst = false;
                    columnCount = aryLine.Length;
                    //创建列
                    for (int i = 0; i < columnCount; i++)
                    {
                        DataColumn dc = new DataColumn(aryLine[i]);
                        dt.Columns.Add(dc);
                    }
                }
                else
                {
                    DataRow dr = dt.NewRow();
                    for (int j = 0; j < columnCount; j++)
                    {
                        dr[j] = aryLine[j];
                    }
                    dt.Rows.Add(dr);
                }
            }
            sr.Close();
            fs.Close();
            return dt;
        }
    
        /// <summary>
        /// 存入数据库
        /// </summary>
        /// <param name="str"></param>
        public void SaveSql(string str,SqlParameter[] para)
        {
            SqlConnection conn = new SqlConnection(connString);
            conn.Open();
            string mysql = str;
            SqlCommand cmd = new SqlCommand(mysql, conn);
            cmd.Parameters.AddRange(para);
            cmd.ExecuteNonQuery();
            conn.Close();
        }
    
        /// <summary>
        /// post数据请求,返回html代码
        /// </summary>
        /// <param name="areaName">输入百科词条</param>
        /// <returns></returns>
        public string UrlPost(string areaName)
        {
            Encoding encoding = Encoding.GetEncoding("UTF-8");
            Stream outstream = null;
            Stream instream = null;
            StreamReader sr = null;
    
            string url = "http://baike.baidu.com/search/word?word=" + areaName;
    
            HttpWebRequest request = null;
    
            HttpWebResponse response = null;
    
    
            // 准备请求,设置参数
    
            request = WebRequest.Create(url) as HttpWebRequest;
    
            request.Method = "POST";
    
            //request.ContentType = "application/x-www-form-urlencoded";
            request.ContentType = "text/xml";//注意了,上面的那种方式不可采用,因为当遇到xml格式的就会报远程服务器错误。500
    
    
            byte[] data = encoding.GetBytes(url);
    
            request.ContentLength = data.Length;
    
            outstream = request.GetRequestStream();
    
            outstream.Write(data, 0, data.Length);
    
            outstream.Flush();
    
            outstream.Close();
    
            //发送请求并获取相应回应数据
    
    
    
    
    
            response = request.GetResponse() as HttpWebResponse;
    
            //直到request.GetResponse()程序才开始向目标网页发送Post请求
    
            instream = response.GetResponseStream();
    
            sr = new StreamReader(instream, encoding);
    
            //返回结果网页(html)代码
    
            string content = sr.ReadToEnd();
            return content;
    
        }
    
        
    
    }
    View Code
  • 相关阅读:
    神奇的flex布局
    reset、revert、rebase
    Vue.filter过滤器
    moment.js时间格式化总结
    Vue之组件大全
    过滤器filter
    Vue之animate
    Vue之axios
    Mac OS系统上测试PHP代码前的准备工作 | 使用XAMPP搭建Apache服务器的步骤
    Python中的标识符、关键字、变量、语句、注释、模块
  • 原文地址:https://www.cnblogs.com/mancomeon/p/4655321.html
Copyright © 2011-2022 走看看