zoukankan      html  css  js  c++  java
  • 爬虫—启新宝接口函数

    import re
    import time
    from urllib.parse import quote

    from bs4 import BeautifulSoup
    from selenium import webdriver


    # ************************************************************************
    def login(username, password):
        """Log in to qixin.com through the QQ OAuth form.

        username/password: QQ account credentials typed into the login iframe.
        Returns the authenticated selenium Firefox driver.
        """
        # NOTE(review): selenium's executable_path normally points at the
        # geckodriver binary, not firefox.exe itself -- confirm this path
        # actually works in the target environment.
        driver = webdriver.Firefox(executable_path='C:/Program Files (x86)/Mozilla Firefox/firefox.exe')
        driver.get('http://www.qixin.com/login')
        time.sleep(2)
        driver.maximize_window()
        driver.find_element_by_link_text('QQ登录').click()
        # The QQ credential form lives inside an iframe; switch into it first.
        # (switch_to_frame() is deprecated in favour of switch_to.frame().)
        driver.switch_to.frame('ptlogin_iframe')
        driver.find_element_by_link_text('帐号密码登录').click()
        driver.find_element_by_id('u').clear()
        driver.find_element_by_id('u').send_keys(username)
        driver.find_element_by_id('p').send_keys(password)
        driver.find_element_by_class_name('login_button').click()
        # Give the post-login redirect time to settle before returning.
        time.sleep(5)
        return driver
    # ************************************************************************
    def search_ename(driver, i):
        """Load the qixin.com search-results page for enterprise name *i*.

        The name is percent-encoded with urllib.parse.quote so that Chinese
        company names form a valid query string (the original interpolated
        the raw text into the URL).
        Returns the driver, now positioned on the results page.
        """
        url = ('http://www.qixin.com/search?key=' + quote(i)
               + '&type=enterprise&method=all')
        print('正在抓取:' + url)
        driver.get(url)
        return driver
    # ************************************************************************
    def get_enameurl(driver):
        """Return the first company link found on the search-results page.

        Returns the absolute URL of the first '/company/gongsi...' anchor, or
        the literal marker string '不存在' when no company link is present.
        """
        # Pin the parser explicitly: a bare BeautifulSoup(markup) call picks
        # whichever parser happens to be installed and emits a warning.
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        sfa = soup.find_all(href=re.compile('/company/gongsi'))
        if sfa:
            return 'http://www.qixin.com' + sfa[0]['href']
        return '不存在'
    # ************************************************************************
    def get_basic_info(soup):
        """Extract registration info, shareholders and key staff from a
        company-page soup.

        Output convention (shared by the other extractors): fields within one
        record are separated by half-width spaces, records by full-width
        (U+3000) spaces; missing values become '-'.
        """
        # --- registration ("工商基本信息") panel ------------------------------
        panels = soup.find_all(class_="panel panel-default basic-info")
        s = ''
        for span in panels[0].find_all('span')[1:]:
            s = s + ' ' + span.string

        # The legal-representative field is rendered as a link, so it cannot
        # be collected with the span loop above.
        links = soup.find_all(href=re.compile('search'))
        if len(links) != 0:
            faren = links[0].string
            # .string is None when the tag holds nested markup; fall back to
            # the '-' placeholder so the concatenation below cannot fail.
            if isinstance(faren, str):
                s = faren + s
            else:
                s = '-' + s
        else:
            # BUG FIX: the original read `s = fanren + s` -- a typo for
            # `faren` that raised NameError; the intent was the placeholder.
            s = '-' + s

        # Company name comes from the page <title>; it may be missing.
        try:
            qiyemingcheng = soup.title.string.split('|')[0]
        except Exception:
            qiyemingcheng = '-'
        s = qiyemingcheng + ' ' + s + '　'

        # --- shareholders and key staff panels --------------------------------
        panels = soup.find_all(class_='panel panel-default', style="margin-top:0")
        # Shareholders appear either as plain <span>s or as <a> links.
        gudonglist = panels[0].find_all('span', class_=None)
        for gudong in gudonglist:
            s = s + gudong.string + ' '
        gudonglist2 = panels[0].find_all('a')
        for gudong in gudonglist2:
            s = s + gudong.string + ' '
        if len(gudonglist) == 0 and len(gudonglist2) == 0:
            s = s + '-'
        s = s + '　'

        # Key staff ("主要人员").
        bosslist = panels[1].find_all(class_='enterprise-employee-name')
        if len(bosslist) != 0:
            for boss in bosslist:
                s = s + boss.string + ' '
        else:
            s = s + '-'
        s = s + '　'
        print('成功写入基本信息数据 ************** ')
        return s
    # ************************************************************************
    def _risk_section_text(soup, section_id):
        """Concatenate the text of every risk-list item under *section_id*.

        Half- and full-width spaces are stripped from each item's text (the
        full-width space is reserved as the record separator); items are
        joined with trailing half-width spaces. Returns '-' for an empty
        section.
        """
        items = soup.find(id=section_id).find_all(class_='risk-list-item')
        if len(items) == 0:
            return '-'
        s = ''
        for item in items:
            text = item.get_text().replace(' ', '').replace('　', '')
            s = s + text + ' '
        return s


    def get_risk_info(soup):
        """Extract the seven risk sections of a company page into one string.

        The original repeated the same extraction loop seven times (and left
        an unused find_all at the top); the per-section logic now lives in
        _risk_section_text. One full-width-space-terminated record is emitted
        per section, in the original order.
        """
        sections = ['changerecord', 'lawsuit', 'notice', 'executionperson',
                    'dishonest', 'sfpm', 'abnormal']
        s = ''
        for section_id in sections:
            s = s + _risk_section_text(soup, section_id) + '　'
        print('成功写入风险信息数据 ************** ')
        return s
    # ************************************************************************
    def get_investment_info(soup):
        """Collect the outbound-investment entries from a company page.

        Each search-result title is emitted with a trailing half-width space;
        an empty section yields the '-' placeholder. The record is terminated
        by a full-width space.
        """
        titles = soup.find_all(class_="search-result-title")
        if titles:
            s = ''.join(title.get_text() + ' ' for title in titles)
        else:
            s = '-'
        s += '　'
        print('成功写入对外投资数据 ************** ')
        return s
    # ************************************************************************
    def _texts_or_dash(tags):
        """Join the text of *tags* with trailing half-width spaces, writing
        '-' in place of any empty value."""
        s = ''
        for tag in tags:
            value = tag.get_text()
            if value == '':
                value = '-'
            s = s + value + ' '
        return s


    def get_report_info(soup):
        """Extract the annual-report basics and the asset/finance table.

        The original duplicated the same extract-or-dash loop for both
        panels; it now lives in _texts_or_dash. Two full-width-space
        terminated records are returned.
        """
        s = ''
        # Basic-info panel of the annual report.
        info_panel = soup.find(class_="panel-body report-info info-part")
        s = s + _texts_or_dash(info_panel.find_all('span')) + '　'
        # Company asset / finance table.
        finance_table = soup.find(class_="table table-bordered table-striped",
                                  style="margin-top:10px;")
        s = s + _texts_or_dash(finance_table.find_all('td')) + '　'
        print('成功写入企业年报数据 ************** ')
        return s
    # ************************************************************************
    def _scrape_tab(driver, anchor, extract, fallback):
        """Click the tab whose link targets *anchor*, re-parse the page and
        run *extract* on it; return *fallback* placeholders on any failure
        (tab missing, extraction error) so one broken section does not abort
        the whole company."""
        try:
            driver.find_element_by_xpath("//a[@href='%s']" % anchor).click()
            time.sleep(1)
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            return extract(soup)
        except Exception:
            return fallback


    def get_evalue(driver, eurl):
        """Fetch company page *eurl* and assemble the basic / risk /
        investment / annual-report fields into one string.

        Fixes over the original: the three bare `except:` clauses are
        narrowed to `except Exception`, the parser is pinned, and the
        triplicated click-sleep-parse pattern is factored into _scrape_tab.
        """
        print('正在抓取' + eurl)
        driver.get(eurl)
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        s = get_basic_info(soup)
        # Placeholders mirror the record counts of each section extractor.
        s = s + _scrape_tab(driver, '#risk', get_risk_info, '-　-　-　-　-　-　-　')
        s = s + _scrape_tab(driver, '#investment', get_investment_info, '-　')
        s = s + _scrape_tab(driver, '#report', get_report_info, '-　-　')
        return s
    # ************************************************************************
    def get_enterprise_data(driver, ename):
        """Search for *ename*, scrape its company page and return the first
        record's fields as a list of strings.

        When no company link is found, the result records the page title plus
        the '该企业不存在' (company does not exist) marker instead.
        """
        # Run the search and pull the first company link from the results.
        driver = search_ename(driver, ename)
        eurl = get_enameurl(driver)
        if eurl != '不存在':
            data = get_evalue(driver, eurl)
        else:
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            try:
                qiyemingcheng = soup.title.string.split('|')[0]
            except Exception:
                qiyemingcheng = '-'
            data = qiyemingcheng + ',' + '该企业不存在　'
        print('成功写入数据 ************** ')
        # Records are separated by full-width spaces, fields by half-width
        # spaces; only the first record's fields are returned.
        sdata = data.split('　')
        endata = sdata[0].split(' ')
        return endata
    # ************************************************************************
    if __name__ == '__main__':
        # Log in once; get_enterprise_data can then be called repeatedly with
        # the same driver. Guarded so importing this module has no side
        # effects.
        driver = login('QQ账号', 'QQ密码')
        data = get_enterprise_data(driver, '企业名称')


  • 相关阅读:
    LeetCode120 Triangle
    LeetCode119 Pascal's Triangle II
    LeetCode118 Pascal's Triangle
    LeetCode115 Distinct Subsequences
    LeetCode114 Flatten Binary Tree to Linked List
    LeetCode113 Path Sum II
    LeetCode112 Path Sum
    LeetCode111 Minimum Depth of Binary Tree
    Windows下搭建PHP开发环境-WEB服务器
    如何发布可用于azure的镜像文件
  • 原文地址:https://www.cnblogs.com/wyj690/p/5386984.html
Copyright © 2011-2022 走看看