zoukankan      html  css  js  c++  java
  • 网站数据采集

    /// <summary>
    /// Fetches the resource at <paramref name="sUrl"/> and returns its text as one string.
    /// </summary>
    /// <remarks>
    /// The body is read line by line and the lines are concatenated directly:
    /// line terminators are dropped and nothing is inserted in their place.
    /// </remarks>
    /// <param name="sUrl">Address of the page to download.</param>
    /// <returns>
    /// The response body with line breaks removed, or an empty string when the body is empty.
    /// </returns>
    private string GetWebContent(string sUrl)
    {
        WebRequest request = WebRequest.Create(sUrl);

        // Fully qualified so this method does not depend on a "using System.Text;" directive.
        System.Text.StringBuilder page = new System.Text.StringBuilder();

        // Dispose the response, stream and reader even when reading throws, so the
        // underlying connection is released.
        // NOTE(review): StreamReader(Stream) decodes as UTF-8 - confirm this matches the site's encoding.
        using (WebResponse response = request.GetResponse())
        using (Stream responseStream = response.GetResponseStream())
        using (StreamReader reader = new StreamReader(responseStream))
        {
            string line;
            // Append every line, including the first one (it used to be read and thrown away,
            // which returned an empty string for single-line pages).
            while ((line = reader.ReadLine()) != null)
            {
                page.Append(line);
            }
        }

        return page.ToString();
    }
            //得到指定字串之间的数据
            private string SplitStr(string src, string startstr, string stopstr)
            {
                //找到开始字符的位置
                string resultstr;
                int startpos=0;
                int stoppos=0;
                MatchCollection Matches = Regex.Matches(src, startstr, RegexOptions.None);
                foreach (Match NextMatch in Matches)
                {
                    startpos=NextMatch.Index+startstr.Length;
                }
              

                MatchCollection Matches2 = Regex.Matches(src, stopstr, RegexOptions.None);
                foreach (Match NextMatch2 in Matches2)
                {
                    stoppos = NextMatch2.Index;
                }
                if (stoppos < startpos)
                    stoppos = startpos;
                if (stopstr == "结束符")
                    stoppos = src.Length;
                resultstr = src.Substring(startpos, stoppos - startpos);
               // MessageBox.Show(resultstr);
                return resultstr;
            }
            private void button1_Click(object sender, EventArgs e)
            {
                //要抓取的URL地址
                string date1 = DateTime.Now.ToString("yymmddhhmmss");
                MessageBox.Show(date1);
                int j;
                string src;
                string strWebContent;
                string desc, sr, sp;
                desc = "";
                WebBrowser webfda = new WebBrowser();
                for (j = 1; j <= 10; j++)
                {
                    string Url = "http://app1.sfda.gov.cn/datasearch/face3/content.jsp?tableId=26&tableName=TABLE26&tableView=%E5%9B%BD%E4%BA%A7%E5%99%A8%E6%A2%B0&Id=" + j.ToString();

                    //得到指定Url的源码
                    strWebContent = GetWebContent(Url);

                    //生成HtmlDocument
                    label1.Text = j.ToString();

                    webfda.Navigate("about:blank");
                    HtmlDocument htmldoc = webfda.Document.OpenNew(true);
                    htmldoc.Write(strWebContent);
                    //textBox1 .Text= htmldoc.Body.InnerHtml;
                    // textBox2.Text = htmldoc.Body.InnerText;
                    //生产场所
                    src = htmldoc.Body.InnerText;
                    sr = "生产场所";
                    sp = "变更日期";//
                    desc = SplitStr(src, sr, sp);

                    sr = "变更日期";
                    sp = "备注";//
                    desc = desc + "," + SplitStr(src, sr, sp);

                    sr = "备注";
                    sp = "注册号";//
                    desc = desc + "," + SplitStr(src, sr, sp);

                    sr = "注册号";
                    sp = "生产单位";//
                    desc = desc + "," + SplitStr(src, sr, sp);

                    sr = "生产单位";
                    sp = "地址";//
                    desc = desc + "," + SplitStr(src, sr, sp);

                    sr = "地址";
                    sp = "邮编";//
                    desc = desc + "," + SplitStr(src, sr, sp);

                    sr = "邮编";
                    sp = "产品名称";//
                    desc = desc + "," + SplitStr(src, sr, sp);

                    sr = "产品名称";
                    sp = "产品标准";//
                    desc = desc + "," + SplitStr(src, sr, sp);

                    sr = "产品标准";
                    sp = "产品性能结构及组成";//
                    desc = desc + "," + SplitStr(src, sr, sp);

                    sr = "产品性能结构及组成";
                    sp = "有效期";//
                    desc = desc + "," + SplitStr(src, sr, sp);

                    sr = "有效期";
                    sp = "批准日期";//
                    desc = desc + "," + SplitStr(src, sr, sp);

                    sr = "批准日期";
                    sp = "产品适用范围";//
                    desc = desc + "," + SplitStr(src, sr, sp);

                    sr = "产品适用范围";
                    sp = "规格型号";//
                    desc = desc + "," + SplitStr(src, sr, sp);

                    sr = "规格型号";
                    sp = "结束符";//
                    desc = desc + "," + SplitStr(src, sr, sp) + "\n";
                    label2.Text = j.ToString();
                    textBox2.Text = textBox2.Text + desc;
                    desc = "";
                }
               
            }

  • 相关阅读:
    Postman测试写法的问题
    Spring Cloud
    Swagger2构造RESTful API开发Java Web
    web前端三大框架(主流Vue.js)
    zookeeper
    分布式服务介绍
    2019年java技术大盘点
    refusing to merge unrelated histories
    PPT制作不加班的十个小窍门
    如何快速完成一份学术型PPT
  • 原文地址:https://www.cnblogs.com/hhq80/p/972209.html
Copyright © 2011-2022 走看看