zoukankan      html  css  js  c++  java
  • Python Selenium Chrome Headless 爬取企查查数据

      1 # -*- coding:utf-8 -*-
      2 import os, pymysql,csv,configparser,pickle
      3 from selenium import webdriver
      4 from user_agent import generate_user_agent
      5 
      6 
      # Module-level state shared by the functions in this script.
      # browser_loaded: 1 while a Chrome session started by init_web_driver() is
      # open, 0 otherwise.
      # csvinited: becomes 1 once init_csv() has written the CSV header row.
      browser_loaded = 0
      csvinited = 0

      # Load the settings from the [config] section of config.cfg (looked up in
      # the current working directory).
      config = configparser.RawConfigParser()
      config.read('config.cfg')

      # Passed to init_web_driver(); 0 starts Chrome headless.
      debugmode = config.getint("config", 'debugmode')
      # 0 makes visit_webpage() create cookies.pkl first; 1 reuses the file.
      cookiedumped = config.getint("config", 'cookiedumped')
      # Path of the CSV file that receives the scraped rows.
      csvpath = config.get("config", 'csvpath')
      # 0 writes results to the CSV file; no other value is handled in this file.
      export = config.getint("config", 'export')
      # Text file listing the search terms, one per line.
      companypath = config.get("config", 'companypath')
      # Path of the chromedriver executable handed to Selenium.
      chromedriver = config.get("config", "chromedriver")
     27 
     28 
     29 import time
     def dur(op=None, clock=[time.time()]):
         """Report the time elapsed since the previous call and restart the timer.

         The mutable default for ``clock`` is intentional: the one-element list is
         created once, when the function is defined, and therefore keeps the last
         timestamp between calls.

         Args:
             op: Label of the operation that just finished. When it is ``None``
                 nothing is printed and only the timer is restarted.
             clock: One-element list holding the reference timestamp. Callers
                 normally leave this at its default.
         """
         if op is not None:
             duration = time.time() - clock[0]
             print('%s finished. Duration %.6f seconds.' % (op, duration))
         # Every call, printing or not, restarts the measurement.
         clock[0] = time.time()
     35 
     def durt(op=None, clock=[time.time()]):
         """Stopwatch with its own clock, independent of any other timer.

         The default ``clock`` list is built once at definition time and is
         private to this function, so it can measure a long-running span while
         other timers are being reset.

         Args:
             op: Label to print together with the elapsed seconds. ``None``
                 suppresses the output and only resets the clock.
             clock: One-element list storing the start timestamp; normally left
                 at its default so the value persists across calls.
         """
         if op is not None:
             elapsed = time.time() - clock[0]
             print('%s finished. Duration %.6f seconds.' % (op, elapsed))
         # Restart the measurement on every call.
         clock[0] = time.time()
     41 
     def init_db():
         """Open the MySQL connection and store it in the global ``CONNECTION``.

         The host, user, password and database values are placeholders that must
         be replaced with real connection data. They are passed by keyword
         because PyMySQL 1.0 and later no longer accept them positionally.
         """
         global CONNECTION
         CONNECTION = pymysql.connect(
             host="地址",
             user="用户名",
             password="密码",
             database="数据库",
             use_unicode=True,
             charset="utf8",
         )
     45 
     46 
     def close_db():
         """Close the connection held in the global ``CONNECTION``.

         ``CONNECTION`` is only assigned by ``init_db()``, so that function has to
         run first.
         """
         CONNECTION.close()
     49 
     50 
     def init_web_driver(opt1=0):
         """Start Chrome through Selenium and store the session in ``DRIVER``.

         Args:
             opt1: ``0`` starts Chrome in headless mode; any other value keeps
                 the browser window visible.

         Side effects:
             Assigns the global ``DRIVER`` and sets ``browser_loaded`` to 1.
         """
         global DRIVER, browser_loaded

         # A freshly generated User-Agent string is used for every new session.
         user_agent = generate_user_agent()

         # '--disable-gpu' is the Chrome driver default setting under Windows OS.
         chrome_flags = ['--disable-gpu']
         if opt1 == 0:
             chrome_flags.append('--headless')
         # Image loading is switched off in both headless and windowed mode.
         chrome_flags.append('blink-settings=imagesEnabled=false')
         chrome_flags.append('--user-agent={}'.format(user_agent))

         co = webdriver.ChromeOptions()
         for flag in chrome_flags:
             co.add_argument(flag)

         # The chromedriver service log is discarded.
         DRIVER = webdriver.Chrome(
             chrome_options=co,
             executable_path=chromedriver,
             service_log_path=os.path.devnull,
         )
         browser_loaded = 1
         print('Chrome process loaed.')
     75 
     76 
     def close_web_driver():
         """Quit the Selenium session held in the global ``DRIVER``.

         ``DRIVER`` is assigned by ``init_web_driver()``; this function does not
         touch the ``browser_loaded`` flag.
         """
         DRIVER.quit()
     79 
     80 
     def spider_create_cookie():
         """Open the login page, wait, and save the session cookies to cookies.pkl.

         NOTE: a second function with the same name is defined further down in
         this file; at import time that later definition replaces this one.

         Side effects:
             Starts and quits a browser, writes ``cookies.pkl`` in the current
             working directory and resets the global ``browser_loaded`` to 0.
         """
         # Without this declaration the assignment below would only create a
         # local variable and the module-level flag would stay at 1.
         global browser_loaded

         init_web_driver(debugmode)
         DRIVER.get('https://www.qichacha.com/user_login')
         DRIVER.find_element_by_xpath('//*[@id="verifyLoginPanel"]/div[1]/a').click()
         # NOTE(review): presumably this pause gives the user time to finish the
         # login by hand - confirm.
         time.sleep(10)
         print(DRIVER.current_url)

         cookies = DRIVER.get_cookies()
         cookie = [item["name"] + "=" + item["value"] for item in cookies]
         # The name=value pairs are joined into text; concatenating the list
         # itself to a string would raise TypeError.
         print('Cookies Loaded' + '\n' + '\n'.join(cookie))

         with open("cookies.pkl", "wb") as cookie_file:
             pickle.dump(cookies, cookie_file)

         close_web_driver()
         browser_loaded = 0
         print('Cookies created.')
     93 
     94 
     def write_csv(inputstr, filename='result.csv', opt='a+'):
         """Write one row to a CSV file using the ``excel`` dialect.

         Args:
             inputstr: Sequence of cell values forming the row.
             filename: Target file. ``None``, an empty string or a
                 whitespace-only string fall back to ``'result.csv'``.
             opt: Mode passed to ``open()``; the default ``'a+'`` appends, while
                 ``'w+'`` truncates the file first.
         """
         if filename is None or not filename.strip():
             filename = 'result.csv'
         # newline='' lets the csv module control the line endings itself.
         with open(filename, opt, newline='') as f:
             csv.writer(f, dialect='excel').writerow(inputstr)
         print('CSV writed.')
    105 
    def init_csv():
        """Create the output CSV with its header row and flag it as initialised.

        The file at ``csvpath`` is opened in ``'w+'`` mode, so existing content
        is overwritten. Afterwards the global ``csvinited`` is set to 1.
        """
        global csvinited

        # Column titles, in the order in which table_reduction() emits values.
        columns = [
            '搜索项', '企业名称', '电话', '官网', '地址',
            '注册资本', '实缴资本', '经营状态', '成立日期',
            '统一社会信用代码', '纳税人识别号', '注册号', '组织机构代码',
            '公司类型', '所属行业', '核准日期', '登记机关',
            '所属地区', '英文名', '曾用名', '经营方式',
            '人员规模', '营业期限', '企业地址', '经营范围',
        ]
        write_csv(columns, csvpath, 'w+')
        csvinited = 1
        print('Output CSV ready.')
    116 
    117 
    118 
    119 #def write_sql():
    120 
    121 
    122 
    def get_companylist(filename='company.csv'):
        """Read the search terms from a text file, one entry per line.

        Args:
            filename: Path of the list file.

        Returns:
            A list with one string per line of the file, in file order, with the
            trailing newline removed. Blank lines are kept as empty strings.
        """
        # The context manager closes the file even if reading fails.
        with open(filename, 'r') as f:
            company_list = [line.rstrip('\n') for line in f]
        # Printed before returning so that the message is actually reached.
        print('Company list loaded.')
        return company_list
    133 
    def table_reduction(searchitem, table, opt=1):
        """Collect one company's details into a row and export it.

        The row has the same column order as the header written by
        ``init_csv()``: the search term, four values from the page header, then
        the cells of the registration table.

        Args:
            searchitem: The term that was searched for; stored as first column.
            table: Selenium element of the registration table whose ``tr`` rows
                are read.
            opt: Unused; kept so that existing calls remain valid.

        Raises:
            IndexError: If the table has fewer rows or cells than expected.
        """
        table_rows = table.find_elements_by_tag_name('tr')

        # Values taken from the header area above the table.
        header_xpaths = (
            # company name
            '//*[@id="company-top"]/div/div[2]/div[1]/h1',
            # phone
            '//*[@id="company-top"]/div[1]/div[2]/div[2]/span[1]/span[2]/span',
            # website
            '//*[@id="company-top"]/div[1]/div[2]/div[2]/span[3]',
            # address
            '//*[@id="company-top"]/div[1]/div[2]/div[3]/span[3]/a[1]',
        )

        # (row index, indexes of the value cells in that row)
        table_layout = (
            (0, (1, 3)),   # registered capital, paid-in capital
            (1, (1, 3)),   # operating status, date of establishment
            (2, (1, 3)),   # unified social credit code, taxpayer ID
            (3, (1, 3)),   # registration number, organisation code
            (4, (1, 3)),   # company type, industry
            (5, (1, 3)),   # approval date, registration authority
            (6, (1, 3)),   # region, English name
            (7, (1, 3)),   # former name, business mode
            (8, (1, 3)),   # staff size, business term
            (9, (1,)),     # company address
            (10, (1,)),    # business scope (last column of the CSV header)
        )

        query_result = [searchitem]
        for xpath in header_xpaths:
            query_result.append(DRIVER.find_element_by_xpath(xpath).text)

        for row_index, cell_indexes in table_layout:
            # Fetch the cells of a row once and reuse them for every value.
            cells = table_rows[row_index].find_elements_by_tag_name('td')
            for cell_index in cell_indexes:
                query_result.append(cells[cell_index].text)

        # Only export mode 0 (local CSV) is implemented.
        if export == 0:
            write_csv(query_result, csvpath)
    215 
    # Obtain the login cookies before the first search.
    def spider_create_cookie():
        """Open the login page in a visible browser and save the cookies.

        The browser is always started with ``init_web_driver(1)``, i.e. not
        headless. After a 10-second pause the cookies of the session are pickled
        to ``cookies.pkl`` in the current working directory.

        Side effects:
            Sets the global ``cookiedumped`` to 1, ends the browser session and
            resets the global ``browser_loaded`` to 0.
        """
        global cookiedumped, browser_loaded

        init_web_driver(1)
        DRIVER.get('https://www.qichacha.com/user_login')
        DRIVER.find_element_by_xpath('//*[@id="verifyLoginPanel"]/div[1]/a').click()
        # NOTE(review): presumably this pause gives the user time to finish the
        # login by hand - confirm.
        time.sleep(10)
        print(DRIVER.current_url)

        with open("cookies.pkl", "wb") as cookie_file:
            pickle.dump(DRIVER.get_cookies(), cookie_file)
        print('Cookies loaded.')
        cookiedumped = 1

        # quit() also stops the chromedriver process; close() would only close
        # the window and leave the driver process running.
        DRIVER.quit()
        browser_loaded = 0
    def visit_webpage(company_name):
        """Search for one company, open its detail page and export its data.

        Steps:
            1. If no cookies have been saved yet (``cookiedumped == 0``), create
               them with ``spider_create_cookie()``.
            2. If a browser is already open, search through the page header;
               otherwise start a browser, restore the cookies from
               ``cookies.pkl`` and search from the home page.
            3. Open the link of the first element with class ``ma_h1`` and pass
               the second table inside ``#Cominfo`` to ``table_reduction()``.

        Args:
            company_name: Text typed into the search box.
        """
        if cookiedumped == 0:
            spider_create_cookie()

        if browser_loaded == 1:
            # A session exists already: use the search box in the page header.
            DRIVER.find_element_by_id("headerKey").send_keys(company_name)
            DRIVER.find_element_by_xpath('/html/body/header/div/form/div/div/span/button').click()
        elif cookiedumped == 1 and browser_loaded == 0:
            init_web_driver(debugmode)
            # The site is opened first, before the saved cookies are added.
            DRIVER.get('https://www.qichacha.com/')
            # cookies.pkl is written by spider_create_cookie(). Unpickling is
            # only acceptable for such a locally created file; never load a
            # pickle from an untrusted source.
            with open("cookies.pkl", "rb") as cookie_file:
                cookies = pickle.load(cookie_file)
            for cookie in cookies:
                DRIVER.add_cookie(cookie)
            DRIVER.find_element_by_id("searchkey").send_keys(company_name)
            DRIVER.find_element_by_id("V3_Search_bt").click()

        # NOTE(review): presumably the first "ma_h1" element is the best search
        # hit - confirm against the result page.
        detail_url = DRIVER.find_element_by_class_name("ma_h1").get_attribute("href")
        DRIVER.get(detail_url)
        table = DRIVER.find_element_by_xpath('//*[@id="Cominfo"]/table[2]')

        # The header row is written once, before the first data row.
        if csvinited == 0:
            init_csv()
        table_reduction(company_name, table)
    def main():
        """Process every entry of the company list and log the failures.

        The list is read from ``companypath`` into the global ``companys``. Each
        entry is handled by ``visit_webpage()``. A failure is reported on the
        console and appended, with a timestamp and the entry itself, to
        ``./log/<date>_<time>_ERROR.log``; the run then continues with the next
        entry.
        """
        global companys

        # open() does not create directories, so make sure ./log exists.
        os.makedirs('./log', exist_ok=True)
        filename = './log/' + time.strftime('%Y-%m-%d_%H-%M') + '_ERROR.log'

        companys = get_companylist(companypath)
        amount = len(companys)

        # The log holds company names, hence the explicit UTF-8 encoding.
        with open(filename, 'a+', encoding='utf-8') as fp:
            # The counter is the position in the list, so it also advances
            # when an entry fails.
            for i, items in enumerate(companys, 1):
                dur()
                try:
                    visit_webpage(items)
                except Exception:
                    # Exception (not a bare except) keeps Ctrl+C working.
                    print(items + ' FAILED TO CATCH')
                    fp.write('{} {} FAILED TO LOAD\n'.format(
                        time.strftime('%Y-%m-%d_%H:%M:%S'), items))
                else:
                    dur(str(i) + ' of ' + str(amount) + ' ' + items)
    279 
    280 
    281 
    282 #
    283 #
    284 #
    285 #
    286 #
    287 
    288 
    289 
    if __name__ == '__main__':
        # Script entry point: time the whole run and always release the browser.
        durt()
        try:
            main()
        finally:
            # DRIVER only exists after init_web_driver() has succeeded, which is
            # what browser_loaded == 1 records. quit() alone ends the session,
            # so a preceding close() is not needed.
            if browser_loaded == 1:
                close_web_driver()
        print(str(len(companys))+' items finieshed! ')
        durt('TOTALY')

    本地配置文件

    [config]
    debugmode=0
    cookiedumped=0
    csvpath=Result.csv
    companypath=CompanyList.txt
    chromedriver=.\chromedriver.exe
    export=0
    [sqlcon]
    ip_port=
    username=
    pwd=
    dbname=

    本地企业列表

    CompanyList.txt,每行放置一个企业名称或统一信用代码

  • 相关阅读:
    jQuery中jsonp的跨域处理,no access-control-allow-origin,unexpected token
    doT中嵌套for循环的使用
    c++ new带括号和不带括号
    python装饰器之使用情景分析
    Python中classmethod与staticmethod区别
    python作用域 scope
    duck type鸭子类型
    EAFP和LBYL 两种防御性编程风格
    c++重载、覆盖和隐藏
    c++ 名字粉碎(name mangling)
  • 原文地址:https://www.cnblogs.com/bionexit/p/9120147.html
Copyright © 2011-2022 走看看