  • 4-2. Collecting Tianyancha data by bypassing the login

    This section tackles the Tianyancha crawler problem. (If you have suggestions or ideas after reading, please share them.)

    Main idea: bypass the login/verification step entirely. Use proxy IPs and go through the mobile search site m.baidu.com, building the search keyword by string concatenation so that the search result leads straight to the Tianyancha company page.
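    The whole trick really is just string concatenation: glue the company name to a Tianyancha-style suffix (one of the commented-out variants in the code below even adds site:www.tianyancha.com) and let the mobile Baidu search surface the Tianyancha page as the first hit. A minimal sketch of the idea, for illustration only; the "word" query parameter and the URL-only construction are assumptions of mine, and the actual crawler below types the keyword into the search box via Selenium instead:

# coding:utf-8
# Sketch only: build the search keyword by concatenation and print the mobile
# Baidu search URL for it. Assumes m.baidu.com accepts a "word" query parameter;
# the full crawler below drives the search box with Selenium instead.
from urllib.parse import quote

company = "长沙国盛动力设备有限公司"                  # company to look up
keyword = company + " site:www.tianyancha.com"       # keep only Tianyancha results
search_url = "https://m.baidu.com/s?word=" + quote(keyword)
print(search_url)   # open this URL (ideally through a proxy) and click the first result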

    Key points:

    1. The code is written with Python 3 + Selenium + ChromeDriver. The main problem is that the crawler runs slowly and is not very efficient (some parts are imperfect and have not been revised).

    2. This approach came from a colleague's suggestion: since we cannot log in and scrape the site head-on, we fetch the company pages indirectly through the mobile search entry point.

    3. Drawback: this works for small amounts of data, but it is not suitable for large datasets.

        To sum it up, this method is a bit like fishing a needle out of the sea: not very practical, and offered only for reference (personally tested).

        Proxy IPs: you have to deal with proxies getting blocked and switch IPs when needed (I used a paid proxy-IP service); a rough sketch follows.
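    Because any single IP gets throttled quickly, the proxy has to be refreshed regularly. A rough sketch of plugging a paid proxy API into ChromeOptions; the endpoint URL and the helper name chrome_with_fresh_proxy are placeholders of mine, while the response shape follows the sample JSON kept in the comments of openChrome() in the code below:

# Sketch only: fetch a fresh proxy from a paid proxy API and hand it to Chrome.
# PROXY_API_URL is a placeholder, not a real endpoint; the JSON shape
# {"code": 0, "data": [{"IP": ..., "Port": ...}]} follows the sample response
# recorded in the comments of openChrome() further down.
import requests
from selenium import webdriver

PROXY_API_URL = "http://example.com/get_proxy"   # placeholder for your provider's endpoint

def chrome_with_fresh_proxy():
    proxy = requests.get(PROXY_API_URL, timeout=5).json()["data"][0]
    option = webdriver.ChromeOptions()
    option.add_argument("--proxy-server=http://%s:%s" % (proxy["IP"], proxy["Port"]))
    return webdriver.Chrome(chrome_options=option)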

    Here is the full code:

# coding:utf-8

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from pymysql import connect
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from bs4 import BeautifulSoup
import re
import time
import requests

since = time.time()

# Launch the browser in the foreground
def openChrome():
    # (headless mode is available below but disabled, so the browser window stays visible)

    option = webdriver.ChromeOptions()
    # open the Chrome browser
    # option.add_argument()

    # headless mode: run without opening a browser window
    # option.add_argument('--headless')
    # option.add_argument('--disable-gpu')

    # disable image loading to speed things up
    prefs = {"profile.managed_default_content_settings.images": 2}
    option.add_experimental_option("prefs", prefs)
    # sample response from the paid proxy API:
    # {"code": 0, "success": "true", "msg": "",
    #  "data": [{"IP": "123.245.11.149", "Port": 32317}, {"IP": "220.165.30.196", "Port": 29096},
    #           {"IP": "114.227.59.61", "Port": 24750}, {"IP": "117.68.144.63", "Port": 33782},
    #           {"IP": "113.25.175.30", "Port": 15188}, {"IP": "60.161.142.110", "Port": 11071},
    #           {"IP": "119.130.17.45", "Port": 36788}, {"IP": "116.55.182.73", "Port": 30259},
    #           {"IP": "182.126.16.190", "Port": 20560}, {"IP": "175.4.21.49", "Port": 27780}]}
    # set a proxy
    # option.add_argument("--proxy-server=http://60.13.50.74:4369")
    driver = webdriver.Chrome(chrome_options=option)
    return driver

# Search for the company and scrape its Tianyancha page
def operationAuth(driver, name):

    print("×" * 40 + " start searching and collecting data " + "×" * 40)
    url = "https://m.baidu.com"
    driver.get(url)
    # wait until the search box is present before typing into it
    WebDriverWait(driver, 10).until(lambda d: d.find_element_by_id('index-kw'))
    search_input = driver.find_element_by_id('index-kw')
    search_input.send_keys(name)
    search_input.send_keys(Keys.ENTER)
    try:
        # click through to the first search result
        # driver.find_element_by_xpath("//div[@class='result c-result'][1]/div[@class='c-container']/a[@class='c-blocka']/h3[@class='c-title c-gap-top-small wa-www-normal-title']/em").click()
        # driver.find_element_by_xpath("//div[@class='result c-result c-clk-recommend'][1]/div[@class='c-container']/a[@class='c-blocka']/h3[@class='c-title c-gap-top-small wa-www-normal-title']").click()
        driver.find_element_by_xpath("//div[@id='page']/div[@id='page-bd']/div[@id='results']/div[@class='result c-result'][1]/div[@class='c-container']").click()
    except:
        print("No matching result found!")
        print("!" * 40 + " attention " + "!" * 40)
    # switch to the newly opened window
    windows = driver.window_handles
    driver.switch_to.window(windows[-1])

    # the page source now contains the data to scrape
    data = driver.page_source

    # print(data)

    # parse the HTML with BeautifulSoup
    soup = BeautifulSoup(data, 'lxml')

    # grab all img tags to locate the company logo
    try:
        img_list = soup.find_all("img")
        # print(img_list)
        src = img_list[2]
        # print(src)

        # <img alt="广州奥利门窗有限公司" class="img expand-img" data-src="https://img.tianyancha.com/logo/lll/0528aacd716b2a87b693d5932280338e.png@!f_200x200" erro-src="" />

        # logo image URL
        logo_url = re.findall(r'"https://.*?"', str(src))
        # print(logo_url)
        url = logo_url[0]
        str1 = re.compile('"')
        logo = re.sub(str1, " ", url)
        # print(logo)
        # strip the whitespace left around the image URL
        logo1 = logo.strip()
        # print(logo1)
    except:
        logo1 = "-"
    print(logo1)

    # Alternative (disabled): same logo extraction as above, but additionally
    # download the logo locally and keep both the local path and the URL:
    # try:
    #     ... (same img/logo extraction as above) ...
    #     print(logo1)
    #     log_pic(logo=logo1, name=search_name)
    #     logo2 = "./log_pics/" + search_name + ".jpg" + "   url:" + logo1
    # except:
    #     logo2 = "无数据"
    # print(logo2)
    # urllib.urlretrieve(logo,"log_pics\%s.jpg"%name)
    try:
        # company name
        name1 = driver.find_element_by_xpath("//div[@class='box']/div[@class='content']/div[@class='header']/h1[@class='name']").text
        company = name1
    except:
        company = '无数据'
    # print("Company name: " + company)


    # grab all span tags (phone and email live in them)
    span_lists = soup.find_all("span")
    # print(span_lists)

    try:
        # grab all span tags
        span_lists = soup.find_all("span")
        # print(span_lists)
        # phone numbers
        data_phone1 = span_lists[8]
        data_phone2 = span_lists[9]
        # print(data_phone)
        # <span class="pl10"><script type="text/html">["0757-85758083", "85756656"]</script><span class="link-click ml10" onclick="openPhonePopup(this)">查看更多</span></span>

        phone = ""
        # phones = re.findall(r"\d{4}-\d{8}|\d{8}", str(data_phone))
        # phones = re.findall(r"\d{4}-\d{7}|\d{7}|\d{4}-\d{8}|\d{8}", str(data_phone))
        phones1 = re.findall(r"\d{4}-\d{7,8}|\d{3}-\d{8}|\d{11}|\d{8}", str(data_phone1))

        for phone_num1 in phones1:
            phone = phone_num1 + "  " + phone

        phones2 = re.findall(r"\d{4}-\d{7,8}|\d{3}-\d{8}|\d{11}|\d{8}", str(data_phone2))
        for phone_num2 in phones2:
            phone = phone_num2 + "  " + phone
    except:
        phone = "-"
    # print("Phone: " + phone)


    try:
        # email addresses
        data_email1 = span_lists[12]
        data_email2 = span_lists[13]
        # print(data_email1)
        # print(data_email2)
        email = ""
        emails1 = re.findall(r"[\w!#$%&'*+/=?^_`{|}~-]+(?:\.[\w!#$%&'*+/=?^_`{|}~-]+)*@(?:[\w](?:[\w-]*[\w])?\.)+[\w](?:[\w-]*[\w])?", str(data_email1))
        for em1 in emails1:
            email = em1 + "  " + email
        emails2 = re.findall(r"[\w!#$%&'*+/=?^_`{|}~-]+(?:\.[\w!#$%&'*+/=?^_`{|}~-]+)*@(?:[\w](?:[\w-]*[\w])?\.)+[\w](?:[\w-]*[\w])?", str(data_email2))
        for em2 in emails2:
            email = em2 + " " + email
    except:
        email = "-"
    # print("Email: " + email)

    try:
        # website
        page = driver.find_element_by_xpath("//div[@class='detail ']/div[@class='f0'][2]/div[@class='in-block'][1]/span[2]").text
    except:
        page = "-"
    # print("Website: " + page)


    try:
        # company profile
        content = driver.find_element_by_xpath("//div[@class='content']/div[@class='detail ']/div[@class='summary']/span[2]").text
    except:
        content = "-"
    # print("Profile: " + content)


    try:
        # legal representative
        represent = driver.find_element_by_xpath("//div[1]/div[@class='humancompany']/div[@class='name']/a[@class='link-click']").text
    except:
        represent = "-"
    # print("Legal representative: " + represent)

    try:
        # registration date (digits are rendered with an obfuscated font, decode with trans())
        register_time1 = driver.find_element_by_xpath("//table[@class='table']/tbody/tr[2]/td/div[2]/text[@class='tyc-num lh24']").text
        year = register_time1[:4]
        y = trans(year)
        mon = register_time1[5:7]
        m = trans(mon)
        day = register_time1[8:10]
        d = trans(day)
        register_time = y + '-' + m + '-' + d
    except:
        register_time = "-"

    try:
        # registration number
        register_num = driver.find_element_by_xpath("//table[@class='table -striped-col -border-top-none']//tr[1]/td[2]").text
    except:
        register_num = '-'
    # print("Registration number: " + register_num)


    try:
        # organization code
        code = driver.find_element_by_xpath("//table[@class='table -striped-col -border-top-none']//tr[1]/td[4]").text
    except:
        code = '-'
    # print("Organization code: " + code)


    try:
        # unified social credit code
        social_code = driver.find_element_by_xpath("//table[@class='table -striped-col -border-top-none']//tr[2]/td[2]").text
    except:
        social_code = "-"
    # print("Unified social credit code: " + social_code)


    try:
        # company type
        company_type = driver.find_element_by_xpath("//table[@class='table -striped-col -border-top-none']//tr[2]/td[4]").text
    except:
        company_type = '-'
    # print("Company type: " + company_type)


    try:
        # industry
        trade = driver.find_element_by_xpath("//table[@class='table -striped-col -border-top-none']//tr[3]/td[4]").text
    except:
        trade = '-'
    # print("Industry: " + trade)


    try:
        # business term
        deadline = driver.find_element_by_xpath("//table[@class='table -striped-col -border-top-none']//tr[4]/td[2]").text
    except:
        deadline = "-"
    # print("Business term: " + deadline)

    try:
        # approval date (obfuscated digits as well, decode with trans())
        right_day1 = driver.find_element_by_xpath("//table[@class='table -striped-col -border-top-none']/tbody/tr[4]/td[4]/text[@class='tyc-num lh24']").text
        year = right_day1[:4]
        y = trans(year)
        mon = right_day1[5:7]
        m = trans(mon)
        day = right_day1[8:10]
        d = trans(day)
        right_day = y + '-' + m + '-' + d
    except:
        right_day = '-'
    try:
        # taxpayer qualification
        qualification = driver.find_element_by_xpath("//table[@class='table -striped-col -border-top-none']//tr[5]/td[2]").text
    except:
        qualification = "-"
    # print("Taxpayer qualification: " + qualification)


    try:
        # staff size
        pscale = driver.find_element_by_xpath("//table[@class='table -striped-col -border-top-none']//tr[5]/td[4]").text
    except:
        pscale = "-"
    # print("Staff size: " + pscale)


    try:
        # paid-in capital
        paid = driver.find_element_by_xpath("//table[@class='table -striped-col -border-top-none']//tr[6]/td[2]").text
    except:
        paid = "-"
    # print("Paid-in capital: " + paid)


    try:
        # registration authority
        registration_authority = driver.find_element_by_xpath("//table[@class='table -striped-col -border-top-none']//tr[6]/td[4]").text
    except:
        registration_authority = '-'
    # print("Registration authority: " + registration_authority)


    try:
        # number of insured employees
        Insured_number = driver.find_element_by_xpath("//table[@class='table -striped-col -border-top-none']//tr[7]/td[2]").text
    except:
        Insured_number = "-"
    # print("Insured employees: " + Insured_number)


    try:
        # English name
        E_name = driver.find_element_by_xpath("//table[@class='table -striped-col -border-top-none']//tr[7]/td[4]").text
    except:
        E_name = "-"
    # print("English name: " + E_name)


    try:
        # address (drop the last 4 characters appended after the address text)
        addr = driver.find_element_by_xpath("//table[@class='table -striped-col -border-top-none']//tr[8]/td[2]").text[:-4]
    except:
        addr = "-"
    # print("Address: " + addr)


    try:
        # business scope
        scope = driver.find_element_by_xpath("//table[@class='table -striped-col -border-top-none']//tr[9]/td[2]/span[@class='select-none']/span[@class='js-shrink-container']/span[@class='js-split-container']/span[@class='tyc-num']/text[@class='tyc-num lh24']").text
    except:
        scope = "-"
    # print("Business scope: " + scope)

    # print(company)
    # print(phone)
    # print(email)
    # print(page)
    # print(content)
    # print(represent)
    # print(register_time)
    # print(register_num)
    # print(code)
    # print(social_code)
    # print(company_type)
    # print(trade)
    # print(deadline)
    # print(right_day)
    # print(qualification)
    # print(pscale)
    # print(paid)
    # print(registration_authority)
    # print(Insured_number)
    # print(E_name)
    # print(addr)
    # print(scope)

    print("×" * 40 + " start writing to the database " + "×" * 40)
    # store the record in MySQL
    conn = connect(host="192.168.113.129",
                   port=3306,
                   database="datas",
                   user="root",
                   password="123456",
                   charset="utf8")

    cursor = conn.cursor()
    try:
        # search_name is set as a global in the __main__ block below
        insertsql = ("insert into tianyancha_datas_test3(search_name,company,logo,phone,email,page,content,represent,register_time,register_num,code,social_code,company_type,trade,deadline,right_day,qualification,pscale,paid,registration_authority,Insured_number,E_name,addr,scope) "
                     "values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)")
        cursor.execute(insertsql, [search_name, company, logo1, phone, email, page, content, represent, register_time, register_num, code, social_code, company_type, trade, deadline, right_day, qualification, pscale, paid, registration_authority, Insured_number, E_name, addr, scope])
        print("-" * 40 + " insert succeeded " + "-" * 40)
    except:
        print("-" * 40 + " insert failed " + "-" * 40)
    conn.commit()
    cursor.close()
    conn.close()

# Character conversion: Tianyancha renders numbers (dates etc.) with a custom font,
# so the characters scraped from the page do not match the digits actually displayed.
# Each scraped digit is substituted according to the table below, which reproduces
# the original if/elif chain; non-digit characters are dropped.
# Example: trans("2018") returns "9521".
def trans(A):
    digit_map = {"0": "5", "1": "2", "2": "9", "3": "0", "4": "6",
                 "5": "4", "6": "7", "7": "3", "8": "1", "9": "8"}
    number = ''.join(digit_map[ch] for ch in A if ch in digit_map)
    return number


# Save the logo image locally (not wired up yet)
def log_pic(logo, name):
    html = requests.get(logo)
    with open('./log_pics/' + name + '.jpg', 'wb') as file:
        file.write(html.content)


# entry point
if __name__ == '__main__':
    # launch Chrome with the options configured above
    driver = openChrome()
    list1 = ['长沙国盛动力设备有限公司', '长沙大禹建筑防水工程有限公司株洲分公司', '长沙大运金属材料有限公司']
    for i in range(len(list1)):
        name = list1[i]
        print('*' * 100)

        print(name)
        search_name = name
        # operationAuth(driver, name=list1[i]+" 信用信息_诉讼信息_财务")
        # operationAuth(driver, name=list1[i]+"_【信用信息_诉讼信息_财务 site:www.tianyancha.com")
        operationAuth(driver, name=list1[i]+"_【信用信息_诉讼信息_财务信息_...")
    time_elapsed = time.time() - since
    print('Crawling complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
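
    One thing the script assumes but never creates is the MySQL table it inserts into. Below is a sketch of a matching schema: the column names are taken from the INSERT statement above, while the types and lengths are only my guesses and should be adjusted to your data.

# Sketch only: create the table tianyancha_datas_test3 that the crawler writes to.
# Column names come from the INSERT statement above; the types/lengths are assumptions.
from pymysql import connect

ddl = """
create table if not exists tianyancha_datas_test3 (
    search_name varchar(255), company varchar(255), logo varchar(512),
    phone varchar(255), email varchar(255), page varchar(255),
    content text, represent varchar(255), register_time varchar(32),
    register_num varchar(64), code varchar(64), social_code varchar(64),
    company_type varchar(128), trade varchar(128), deadline varchar(128),
    right_day varchar(32), qualification varchar(128), pscale varchar(64),
    paid varchar(64), registration_authority varchar(255),
    Insured_number varchar(64), E_name varchar(255), addr varchar(512),
    scope text
) default charset=utf8
"""

conn = connect(host="192.168.113.129", port=3306, database="datas",
               user="root", password="123456", charset="utf8")
with conn.cursor() as cursor:
    cursor.execute(ddl)
conn.commit()
conn.close()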







  • Original post: https://www.cnblogs.com/lvjing/p/9584467.html