zoukankan      html  css  js  c++  java
  • 某查查企业爬虫(模拟人工验证码)

    1、技术栈:.NET Core 3.1、C#、Selenium

    //爬取所有省份、城市、区县
    public override async Task WriteAreaToFileAsync(string configPath, string directory)
            {
                const string BaseProvinceLink = "https://www.qcc.com/search?key={keyword}#industrycode:K&";
                const string BaseCityLink = "https://www.qcc.com/search_getCityListHtml?province={0}";
                const string BaseCountyLink = "https://www.qcc.com/search_getCountyListHtml?city={0}";
                List<string> provinces = await GetCodeAsync(new Uri(BaseProvinceLink), ".sfilter-tag.clearfix.provinceChoose dd a");
                const string baseText = "province:{0}&city:{1}&county:{2}&";
                List<string> list = new List<string>();
                foreach (var province in provinces)
                {
                    StringBuilder.Clear();
                    Uri provinceUri = new Uri(StringBuilder.AppendFormat(BaseCityLink, province).ToString());
                    var cities = await GetCodeAsync(provinceUri, "dd a");
                    foreach (var city in cities)
                    {
                        StringBuilder.Clear();
                        Uri cityUri = new Uri(StringBuilder.AppendFormat(BaseCountyLink, city).ToString());
                        var counties = await GetCodeAsync(cityUri, "dd a");
                        foreach (var county in counties)
                        {
                            StringBuilder.Clear();
                            StringBuilder.Append(BaseProvinceLink);
                            string area = StringBuilder.AppendFormat(baseText, province, city, county).
                                Replace("search", "search_index").Replace("中介#", "中介&ajaxflag=1&")
                                .Replace(":industrycode", "=industrycode").ToString();
                            list.Add(area);
                        }
                    }
                }
                await File.WriteAllLinesAsync("企查查.txt", list);
            }
    
    //分页爬取企业信息
            private async Task<bool> GetAgentsAsync(Uri cityUri)
            {
                LogHelper.Info(cityUri.ToString());
                var pageSource = await HttpClient.GetStringAsync(cityUri);
                while (!pageSource.Contains("查企业"))
                {
                    if (pageSource.StartsWith("<script>window.location"))
                    {
                        VertifyCode(new Uri(pageSource.Split("'")[1]));
                        pageSource = await HttpClient.GetStringAsync(cityUri);
                    }
                    else if (pageSource.Contains("小查还没找到数据"))
                    {
                        return false;
                    }
                }
                var block = JumonyParser.Parse(pageSource).Find(".m_srchList tbody tr td:nth-child(3)");
                foreach (var item in block)
                {
                    await VertifyAsync(item.InnerHtml());
                }
                if (block.Count() < PageSize)
                {
                    return false;
                }
                return true;
            }
    

    2、结果截图

    3、需要开通vip账号

    4、过滑动验证码

  • 相关阅读:
    JS控制的几种页面跳转方式和传值
    文件管理
    文件:文件和文件夹
    上传文件
    购物车的例子
    使用ajax登录格式
    ajax 另外两种返回类型(json xml)
    省级三级联动
    thinkPHP--SQL连贯操作
    thinkPHP-空操作
  • 原文地址:https://www.cnblogs.com/Zdelta/p/14122308.html
Copyright © 2011-2022 走看看