zoukankan      html  css  js  c++  java
  • 爬虫-HtmlAgilityPack

    写了一个简单的小爬虫,使用 HtmlAgilityPack 抓取婴儿配方奶粉的产品注册信息和配方数据。
    HtmlAgilityPack:https://html-agility-pack.net/

    参考

    HtmlAgilityPack - 详细简介和使用

    using System;
    using System.Collections.Generic;
    using System.Linq;
    using System.Net;
    using System.Web;
    using System.Web.Mvc;
    using CrawlerForBaby.Models;
    using CrawlerForBaby.Untity;
    using HtmlAgilityPack;
    
    namespace CrawlerForBaby.Controllers
    {
        public class HomeController : Controller
        {
    
            /// <summary>
            /// Serves the default view for this action (the page that hosts the
            /// URL submission form and the recipe table).
            /// </summary>
            /// <returns>The view result produced by <c>View()</c>.</returns>
            public ActionResult Index()
            {
                ActionResult homeView = this.View();
                return homeView;
            }
    
            /// <summary>Page size used when the caller supplies no usable <c>limit</c>.</summary>
            private const int DefaultPageSize = 10;

            /// <summary>
            /// Returns one page of recipe rows, each joined with its product,
            /// wrapped in a <c>ResultModel</c> and serialized as JSON.
            /// </summary>
            /// <param name="page">1-based page number; values below 1 are treated as 1.</param>
            /// <param name="limit">Rows per page; values below 1 fall back to <see cref="DefaultPageSize"/>.</param>
            /// <returns>
            /// JSON whose <c>data</c> holds the rows of the requested page (ordered by Id)
            /// and whose <c>count</c> holds the total number of joined rows.
            /// </returns>
            public JsonResult GetTables(int page = 1, int limit = DefaultPageSize)
            {
                // A page or limit below 1 would hand a negative value to Skip/Take,
                // which fails the whole request; clamp to safe values instead.
                if (page < 1)
                {
                    page = 1;
                }
                if (limit < 1)
                {
                    limit = DefaultPageSize;
                }

                using (CrawlerForBabyEntities1 db = new CrawlerForBabyEntities1())
                {
                    var query = from s in db.BabyRecipe
                                join p in db.Product on s.ProductId equals p.ProductId
                                select new DTOList
                                {
                                    Id = s.Id,
                                    SerialNum = s.SerialNum,
                                    Project = s.Project,
                                    Unit = s.Unit,
                                    EveryHundredKJ = s.EveryHundredKJ,
                                    EveryHundredG = s.EveryHundredG,
                                    ProductId = p.ProductId,
                                    RegistrationID = p.RegistrationID,
                                    CommonName = p.CommonName,
                                    ProductName = p.ProductName,
                                    EngLishName = p.EngLishName,
                                    Process = p.Process,
                                    ProcessName = p.ProcessName,
                                    IsRawMilkSkim = p.IsRawMilkSkim,
                                    Type = p.Type
                                };

                    // Paging needs a stable order, hence the OrderBy before Skip/Take.
                    var tables = query.OrderBy(s => s.Id)
                                      .Skip((page - 1) * limit)
                                      .Take(limit)
                                      .ToList();
                    var total = query.Count();

                    return Json(new ResultModel<DTOList>() { success = true, code = 0, count = total, data = tables, msg = "" }, JsonRequestBehavior.AllowGet);
                }
            }
    
            /// <summary>Only product detail pages under this exact address may be crawled.</summary>
            private const string DetailUrlPrefix = "http://tsspxx.gsxt.gov.cn/tyyp/detailyp.xhtml";

            /// <summary>
            /// Number of leading characters stripped from the registration ID to get the
            /// number appended to the recipe URL (presumably the "国食注字" prefix that the
            /// recipe URL already carries in encoded form - confirm against the site).
            /// </summary>
            private const int RegistrationPrefixLength = 4;

            /// <summary>Cells per recipe row: serial number, project, unit, per-100kJ, per-100g.</summary>
            private const int RecipeColumnCount = 5;

            /// <summary>
            /// Crawls a product detail page and its recipe pages, then stores the product
            /// together with its recipe rows unless a product with the same common name
            /// and product name already exists.
            /// </summary>
            /// <param name="json">
            /// The detail page URL (the form field is named "json"). It must start with
            /// <see cref="DetailUrlPrefix"/>; surrounding whitespace is ignored.
            /// </param>
            /// <returns>
            /// A <c>ResultModel</c> as JSON: <c>success = false</c> with a message when the URL
            /// is rejected, the page cannot be parsed, or the product already exists;
            /// <c>success = true</c> once the product and its recipe rows are saved.
            /// </returns>
            [HttpPost]
            public JsonResult AddTables(string json)
            {
                // The server fetches this URL itself, so it must *start* with the allowed
                // address. A "contains" check would accept any host that merely embeds it.
                string url = json == null ? string.Empty : json.Trim();
                if (!url.StartsWith(DetailUrlPrefix, StringComparison.Ordinal))
                {
                    return Json(new ResultModel<string>() { success = false, msg = "链接不对" });
                }

                var doc = new HtmlWeb().Load(url);

                string registrationId = FindRowValue(doc, "注册号");
                string commonNameText = FindRowValue(doc, "通用名称(产品)");
                string productNameText = FindRowValue(doc, "商品名称(产品)");
                string englishNameText = FindRowValue(doc, "英文名称(产品)");
                string processText = FindRowValue(doc, "生产工艺");

                // Without these fields there is nothing to crawl or to de-duplicate on.
                if (registrationId == null
                    || registrationId.Length <= RegistrationPrefixLength
                    || commonNameText == null
                    || productNameText == null)
                {
                    return Json(new ResultModel<string>() { success = false, msg = "页面解析失败" });
                }

                string registNumber = registrationId.Substring(RegistrationPrefixLength);

                using (CrawlerForBabyEntities1 db = new CrawlerForBabyEntities1())
                {
                    // Check for duplicates before crawling the recipe pages.
                    bool alreadyStored = db.Product.Any(s => s.CommonName == commonNameText && s.ProductName == productNameText);
                    if (alreadyStored)
                    {
                        return Json(new ResultModel<string>() { success = false, msg = "已经存在" });
                    }

                    var url1 = "http://tsspxx.gsxt.gov.cn:80//tyyp/detailPf.xhtml?COLUMN1667=%25E5%259B%25BD%25E9%25A3%259F%25E6%25B3%25A8%25E5%25AD%2597" + registNumber;
                    // NOTE(review): the result of this load is unused. The request is kept so
                    // the request sequence sent to the site is unchanged - confirm whether
                    // it can be dropped.
                    new HtmlWeb().Load(url1);
                    var headers = HTTPHeader.GetHTTPResponseHeaders(url1);
                    string cookie = headers["Set-Cookie"];

                    var url2 = "http://tsspxx.gsxt.gov.cn:80//tyyp/yppfpage.xhtml?currentPage=6";
                    var web2 = new HtmlWeb();
                    HtmlAgilityPack.HtmlWeb.PreRequestHandler handler = delegate (HttpWebRequest request)
                    {
                        request.Headers[HttpRequestHeader.AcceptEncoding] = "gzip, deflate";
                        request.Headers[HttpRequestHeader.Cookie] = cookie;
                        request.AutomaticDecompression = DecompressionMethods.Deflate | DecompressionMethods.GZip;
                        return true;
                    };
                    web2.PreRequest += handler;
                    // NOTE(review): result unused as well; kept for the same reason as above.
                    web2.Load(url2);

                    var list = GetPageData.GetData(cookie);

                    // Parse every recipe row before touching the database, so a malformed
                    // page cannot leave a product saved without its recipe rows.
                    var recipes = new List<BabyRecipe>();
                    if (list != null)
                    {
                        foreach (var item in list)
                        {
                            var tds = item.DocumentNode.Descendants("td").ToList();

                            // Step one whole row at a time; a trailing partial row is ignored
                            // instead of indexing past the end of the cell list.
                            for (int i = 0; i + RecipeColumnCount <= tds.Count; i += RecipeColumnCount)
                            {
                                int serialNum;
                                if (!int.TryParse(tds[i].InnerText, out serialNum))
                                {
                                    // Not a data row (no numeric serial number) - skip it.
                                    continue;
                                }

                                // Unparseable amounts are stored as 0.
                                double kjResult;
                                double gResult;
                                double.TryParse(tds[i + 3].InnerText, out kjResult);
                                double.TryParse(tds[i + 4].InnerText, out gResult);

                                recipes.Add(new BabyRecipe()
                                {
                                    SerialNum = serialNum,
                                    Project = tds[i + 1].InnerText,
                                    Unit = tds[i + 2].InnerText,
                                    EveryHundredKJ = Math.Round(kjResult, 2, MidpointRounding.AwayFromZero),
                                    EveryHundredG = Math.Round(gResult, 2, MidpointRounding.AwayFromZero)
                                });
                            }
                        }
                    }

                    Product product = new Product()
                    {
                        RegistrationID = registrationId,
                        CommonName = commonNameText,
                        ProductName = productNameText,
                        EngLishName = englishNameText ?? string.Empty,
                        Process = processText ?? string.Empty,
                    };

                    // Save the product and all of its recipe rows atomically; disposing the
                    // transaction without Commit rolls everything back.
                    using (var transaction = db.Database.BeginTransaction())
                    {
                        db.Product.Add(product);
                        db.SaveChanges(); // populates product.ProductId

                        foreach (var recipe in recipes)
                        {
                            recipe.ProductId = product.ProductId;
                        }
                        db.BabyRecipe.AddRange(recipes);
                        db.SaveChanges();

                        transaction.Commit();
                    }
                }

                return Json(new ResultModel<string>() { success = true, code = 0, msg = "" }, JsonRequestBehavior.AllowGet);
            }

            /// <summary>
            /// Finds the first table row whose header cell text equals
            /// <paramref name="headerText"/> and returns the text of its data cell.
            /// </summary>
            /// <param name="doc">The parsed detail page.</param>
            /// <param name="headerText">Exact inner text of the row's <c>th</c> cell.</param>
            /// <returns>
            /// The inner text of the row's <c>td</c> cell, or <c>null</c> when no row matches
            /// or the matching row has no <c>td</c> cell.
            /// </returns>
            private static string FindRowValue(HtmlDocument doc, string headerText)
            {
                foreach (HtmlNode row in doc.DocumentNode.Descendants("tr"))
                {
                    // Rows without a header cell yield null here and are skipped.
                    HtmlNode header = row.ChildNodes["th"];
                    if (header == null || header.InnerText != headerText)
                    {
                        continue;
                    }

                    HtmlNode cell = row.ChildNodes["td"];
                    return cell == null ? null : cell.InnerText;
                }

                return null;
            }
        }
    }
    

    前端

    @{
        ViewBag.Title = "Home Page";
    }
    <link href="~/Content/layui/css/layui.css" rel="stylesheet" />
    <script src="~/Content/layui/layui.js"></script>
    
    <div class="jumbotron">
        <h1>ASP.NET</h1>
    </div>
    
    
    <form class="layui-form" action="">
        <div class="layui-form-item layui-form-text">
            <label class="layui-form-label">URL</label>
            <div class="layui-input-block">
                示例:<a href="http://tsspxx.gsxt.gov.cn/tyyp/detailyp.xhtml?id=A4FA632E8E15B4D8E055620810C6201A" target="_blank">http://tsspxx.gsxt.gov.cn/tyyp/detailyp.xhtml?id=A4FA632E8E15B4D8E055620810C6201A</a>
            </div>
            <div class="layui-input-block">
                <textarea name="json" placeholder="请输入URL" class="layui-textarea" id="textjson" style="height: 50px;min-height:50px"></textarea>
            </div>
        </div>
        <div class="layui-form-item">
            <div class="layui-input-block">
                <button class="layui-btn" lay-submit lay-filter="formDemo">立即提交</button>
                <button type="reset" class="layui-btn layui-btn-primary">重置</button>
            </div>
        </div>
    </form>
    
    <div class="row">
        <table id="demo" lay-filter="test"></table>
    </div>
    
    <style>
        .layui-table-cell {
            height: 44px;
            line-height: 44px;
        }
    
        .optherName:active {
        }
    
        .optherName:hover {
            color: #ffffff;
            background-color: #379736;
        }
    </style>
    
    <script>
        layui.use(['table', 'form'], function () {
            var table = layui.table;
            var form = layui.form;

            // Submit the URL form through AJAX; returning false stops the normal form post.
            form.on('submit(formDemo)',
                function (data) {
                    $.ajax({
                        // Root-relative like the table URL below, so it resolves the
                        // same way from "/" and from "/Home/Index".
                        url: "/Home/AddTables",
                        contentType: "application/x-www-form-urlencoded",
                        data: data.field, // form fields as a plain object
                        method: 'POST',
                        success: function (res) {
                            console.log(res);
                            if (res.success) {
                                layer.msg("新增成功!");
                                tableObj.reload(); // reload the table to show the new rows
                            } else {
                                layer.msg(res.msg);
                            }
                        }
                    });
                    $('#textjson').val('');
                    return false;
                });

            // Recipe table, paged by the server.
            var tableObj = table.render({
                elem: '#demo'
                , height: 312
                , url: '/home/GetTables/' // data endpoint
                , page: true // enable paging
                , cols: [[ // column definitions
                    { field: 'Id', title: 'ID', width: 80, sort: true, fixed: 'left' }
                    , { field: 'SerialNum', title: '序号', width: 60 }
                    , { field: 'CommonName', title: '通用名', width: 240 }
                    , { field: 'ProductName', title: '产品名', width: 140 }
                    , { field: 'EngLishName', title: '英文', width: 120 }
                    , { field: 'Process', title: '工艺', width: 150 }
                    , { field: 'Project', title: '项目', width: 250, sort: true }
                    , { field: 'Unit', title: '单位', width: 80 }
                    , { field: 'EveryHundredKJ', title: '每100kJ', width: 120 }
                    , { field: 'EveryHundredG', title: '每100g', width: 120, sort: true }
                ]]
            });
        });
    </script>
    
  • 相关阅读:
    单片机就那点资源,为啥还要用RTOS?
    JVM 虚拟机参数配置
    C# 多态virtual标记重写 以及EF6 查询性能AsNoTracking
    C# HttpClient发送请求获取接口数据
    C# Socket服务端和客户端通话
    C# 生成图片验证码 图片缩略图 水印
    ADO.NET 帮助类 参数传递 存储过程 分页
    hadoop单机部署
    tengine-sticky
    redis持久化
  • 原文地址:https://www.cnblogs.com/tangge/p/12834083.html
Copyright © 2011-2022 走看看