# -*- coding:utf-8 -*-
"""Look up companies on qichacha.com with Selenium and export the details.

For every entry in the company list file the script searches the site,
opens the first hit, reads the header fields and the ``Cominfo`` table and,
when ``export`` is 0, appends one row to the CSV file named in ``config.cfg``.
"""
import configparser
import csv
import os
import pickle
import time

import pymysql
from selenium import webdriver
from user_agent import generate_user_agent

# Runtime state shared by the functions below.
browser_loaded = 0  # 1 while a Chrome session started by init_web_driver() is open
csvinited = 0       # 1 once init_csv() has written the header row
DRIVER = None       # Selenium driver, set by init_web_driver()
CONNECTION = None   # MySQL connection, set by init_db()

# Read the configuration file.
config = configparser.RawConfigParser()
config.read('config.cfg')
debugmode = int(config.get("config", 'debugmode'))
cookiedumped = int(config.get("config", 'cookiedumped'))
csvpath = config.get("config", 'csvpath')
export = int(config.get("config", 'export'))
companypath = config.get("config", 'companypath')
chromedriver = config.get("config", "chromedriver")

# XPaths of the fields read from the page header, in output column order:
# company name, phone, official website, address.
_HEADER_XPATHS = (
    '//*[@id="company-top"]/div/div[2]/div[1]/h1',
    '//*[@id="company-top"]/div[1]/div[2]/div[2]/span[1]/span[2]/span',
    '//*[@id="company-top"]/div[1]/div[2]/div[2]/span[3]',
    '//*[@id="company-top"]/div[1]/div[2]/div[3]/span[3]/a[1]',
)


def dur(op=None, clock=[time.time()]):
    """Print the time elapsed since the previous call, then restart the clock.

    Args:
        op: Label of the operation that just finished. When ``None`` nothing
            is printed and the clock is only restarted.
        clock: One-element list holding the start timestamp. The mutable
            default is deliberate: it is the state kept between calls.
    """
    if op is not None:
        duration = time.time() - clock[0]
        print('%s finished. Duration %.6f seconds.' % (op, duration))
    clock[0] = time.time()


def durt(op=None, clock=[time.time()]):
    """Same as :func:`dur` but with its own, independent clock.

    The script uses it to time the whole run while :func:`dur` times each
    company.

    Args:
        op: Label to print, or ``None`` to only restart the clock.
        clock: One-element list holding the start timestamp (deliberately a
            mutable default, see :func:`dur`).
    """
    if op is not None:
        duration = time.time() - clock[0]
        print('%s finished. Duration %.6f seconds.' % (op, duration))
    clock[0] = time.time()


def init_db():
    """Open the MySQL connection and store it in the global ``CONNECTION``.

    The connection parameters are still placeholders. They are passed by
    keyword because recent PyMySQL releases no longer accept positional
    arguments for ``connect()``.
    """
    global CONNECTION
    CONNECTION = pymysql.connect(
        host="地址",
        user="用户名",
        password="密码",
        database="数据库",
        use_unicode=True,
        charset="utf8",
    )


def close_db():
    """Close the connection opened by :func:`init_db`."""
    CONNECTION.close()


def init_web_driver(opt1=0):
    """Start Chrome and store the driver in the global ``DRIVER``.

    Args:
        opt1: 0 starts Chrome headless with image loading disabled; any other
            value starts a visible browser window.
    """
    # NOTE(review): uses the Selenium 3 constructor keywords
    # (chrome_options / executable_path / service_log_path); confirm the
    # pinned selenium version before upgrading.
    # NOTE(review): the original indentation was lost; the image-loading
    # switch is assumed to belong to the headless branch - confirm.
    global DRIVER, browser_loaded
    user_agent = generate_user_agent()
    co = webdriver.ChromeOptions()
    # Chrome driver default setting under Windows OS
    co.add_argument('--disable-gpu')

    if opt1 == 0:
        # Set the Chrome in headless mode
        co.add_argument('--headless')
        # Disable images loading
        co.add_argument('blink-settings=imagesEnabled=false')

    # Add User-Agent Profile
    co.add_argument('--user-agent={}'.format(user_agent))

    # Initialize Chrome
    DRIVER = webdriver.Chrome(
        chrome_options=co,
        executable_path=chromedriver,
        service_log_path=os.path.devnull
    )
    browser_loaded = 1
    print('Chrome process loaed.')


def close_web_driver():
    """Quit the browser and mark it as no longer loaded."""
    global browser_loaded
    DRIVER.quit()
    browser_loaded = 0


def write_csv(inputstr, filename='result.csv', opt='a+'):
    """Write one row to a CSV file using the ``excel`` dialect.

    Args:
        inputstr: Sequence of cell values forming the row.
        filename: Target file; a blank name falls back to ``result.csv``.
        opt: Mode passed to ``open`` (``'a+'`` appends, ``'w+'`` truncates).
    """
    if not filename.strip():
        filename = 'result.csv'
    with open(filename, opt, newline='') as f:
        csv.writer(f, dialect='excel').writerow(inputstr)
    print('CSV writed.')


def init_csv():
    """Create (or truncate) the output CSV and write its header row."""
    global csvinited
    headline = ['搜索项', '企业名称', '电话', '官网', '地址', '注册资本', '实缴资本',
                '经营状态', '成立日期', '统一社会信用代码', '纳税人识别号',
                '注册号', '组织机构代码', '公司类型', '所属行业', '核准日期',
                '登记机关', '所属地区', '英文名', '曾用名', '经营方式', '人员规模',
                '营业期限', '企业地址', '经营范围']
    write_csv(headline, csvpath, 'w+')
    csvinited = 1
    print('Output CSV ready.')


def get_companylist(filename='company.csv'):
    """Read the search terms, one per line, from ``filename``.

    Spaces inside a line are removed, surrounding whitespace (including the
    line terminator) is stripped and blank lines are skipped.

    Args:
        filename: Path of the text file holding the search terms.

    Returns:
        List of cleaned search terms in file order.
    """
    company_list = []
    with open(filename, 'r') as f:
        for line in f:
            name = line.replace(' ', '').strip()
            if name:
                company_list.append(name)
    print('Company list loaded.')
    return company_list


def table_reduction(searchitem, table, opt=1):
    """Collect one result row from the current detail page and export it.

    The row has the same column order as the header written by
    :func:`init_csv`: the search term, four header fields, then the values
    of the registration table.

    Args:
        searchitem: The term that was searched for (first output column).
        table: The ``Cominfo`` table element of the detail page.
        opt: Unused; kept for backward compatibility.

    Raises:
        IndexError: If the table has fewer rows or cells than expected.
    """
    table_rows = table.find_elements_by_tag_name('tr')

    query_result = [searchitem]
    # Company name, phone, official website, address.
    for xpath in _HEADER_XPATHS:
        query_result.append(DRIVER.find_element_by_xpath(xpath).text)

    # Rows 0-8 each contribute two values, taken from cells 1 and 3:
    # registered capital / paid-in capital, operating status / date of
    # establishment, unified social credit code / taxpayer ID, registration
    # number / organization code, company type / industry, approval date /
    # registration authority, region / English name, former name / business
    # mode, staff size / business term.
    for row_index in range(9):
        cells = table_rows[row_index].find_elements_by_tag_name('td')
        query_result.append(cells[1].text)
        query_result.append(cells[3].text)

    # Row 9 is the company address and row 10 the business scope.
    for row_index in (9, 10):
        cells = table_rows[row_index].find_elements_by_tag_name('td')
        query_result.append(cells[1].text)

    # Only CSV export (export == 0) is implemented; MySQL export is not.
    if export == 0:  # Write in local csv
        write_csv(query_result, csvpath)


# Obtain the login cookie before use.
def spider_create_cookie():
    """Open a visible browser on the login page and save the session cookies.

    After clicking the login-panel link the function waits 10 seconds
    (presumably to let the user log in manually - confirm), then pickles the
    browser cookies to ``cookies.pkl`` and sets ``cookiedumped``. The browser
    is always shut down afterwards, even when a step fails.
    """
    global cookiedumped
    init_web_driver(1)
    try:
        DRIVER.get('https://www.qichacha.com/user_login')
        DRIVER.find_element_by_xpath('//*[@id="verifyLoginPanel"]/div[1]/a').click()
        time.sleep(10)
        print(DRIVER.current_url)
        with open("cookies.pkl", "wb") as cookie_file:
            pickle.dump(DRIVER.get_cookies(), cookie_file)
        print('Cookies loaded.')
        cookiedumped = 1
    finally:
        # quit() rather than close() so the chromedriver process ends too.
        close_web_driver()


def visit_webpage(company_name):
    """Search for ``company_name``, open the first hit and export its data.

    Creates the login cookies first when none were dumped yet. If a browser
    is already open the search box in the page header is used; otherwise a
    new browser is started, the saved cookies are loaded and the home-page
    search box is used.

    Args:
        company_name: Company name or code to search for.
    """
    if cookiedumped == 0:
        spider_create_cookie()

    if browser_loaded == 1:
        DRIVER.find_element_by_id("headerKey").send_keys(company_name)
        DRIVER.find_element_by_xpath('/html/body/header/div/form/div/div/span/button').click()
    elif cookiedumped == 1:
        init_web_driver(debugmode)
        DRIVER.get('https://www.qichacha.com/')
        # cookies.pkl is written by spider_create_cookie(); never point this
        # at an untrusted file, unpickling can execute arbitrary code.
        with open("cookies.pkl", "rb") as cookie_file:
            cookies = pickle.load(cookie_file)
        for cookie in cookies:
            DRIVER.add_cookie(cookie)
        DRIVER.find_element_by_id("searchkey").send_keys(company_name)
        DRIVER.find_element_by_id("V3_Search_bt").click()

    # Follow the first search result to its detail page.
    DRIVER.get(DRIVER.find_element_by_class_name("ma_h1").get_attribute("href"))
    table = DRIVER.find_element_by_xpath('//*[@id="Cominfo"]/table[2]')
    if csvinited == 0:
        init_csv()
    table_reduction(company_name, table)


def main():
    """Process every company in the list, logging failures to ``./log/``.

    A failure for one company is printed, appended to the error log together
    with the company name and the exception, and does not stop the run.
    """
    global companys
    log_dir = './log/'
    # The log directory may not exist on a fresh checkout.
    os.makedirs(log_dir, exist_ok=True)
    filename = log_dir + time.strftime('%Y-%m-%d_%H-%M') + '_ERROR.log'

    companys = get_companylist(companypath)
    amount = len(companys)
    with open(filename, 'a+', encoding='utf-8') as fp:
        for i, items in enumerate(companys, start=1):
            dur()
            try:
                visit_webpage(items)
            except Exception as err:
                # Exception (not a bare except) so Ctrl-C still stops the run.
                print(items + ' FAILED TO CATCH')
                fp.write('{} {} FAILED TO LOAD: {!r}\n'.format(
                    time.strftime('%Y-%m-%d_%H:%M:%S'), items, err))
            else:
                dur(str(i) + ' of ' + str(amount) + ' ' + items)


if __name__ == '__main__':
    durt()
    try:
        main()
    finally:
        # Only shut the browser down if one was actually started.
        if browser_loaded:
            close_web_driver()
    print(str(len(companys)) + ' items finieshed! ')
    durt('TOTALY')
本地配置文件
[config]
debugmode=0
cookiedumped=0
csvpath=Result.csv
companypath=CompanyList.txt
chromedriver=.chromedriver.exe
export=0

[sqlcon]
ip_port=
username=
pwd=
dbnanme=
本地企业列表
CompanyList.txt:每行放置一个企业名称或统一社会信用代码。