下面就是源代码,我是直接把数据保存在字典并打印到控制台的。
# -*- coding: utf-8 -*-
"""Crawler for car brand, series and model data from bitauto / yiche pages.

Flow: log in with a shared ``requests`` session, download the brand tree,
visit every brand page to collect series links, then fetch each series page
in a thread pool and print the extracted fields as a dict.
"""
import functools
import os
import random
import re
import time
import urllib.request
from concurrent.futures import ThreadPoolExecutor
from datetime import datetime

import pymysql
import redis
import requests
from lxml import etree

from myweb.yichewang import user_agent

# Kept last, as in the original file, so the names it exports keep the same
# precedence over the imports above.
from wxpy import *

# bot = Bot(cache_path=True)  # optional WeChat-side run monitoring (disabled)

BASE_URL = 'http://car.bitauto.com/'

# Patterns applied to the brand-tree response (a JS-like object literal).
_BRAND_URL_RE = re.compile(r'url:"(.*?)"')
_BRAND_NAME_RE = re.compile(r'name:"(.*?)"')
# Patterns applied to a series page: displacement and gearbox spans.
_DISPLACEMENT_RE = re.compile(r'<span class="data" title="(.*?)">(.*?)</span>')
_GEARBOX_RE = re.compile(r'<span class="data">(.*?)</span>')


def run_time(func):
    """Decorator: print the wall-clock duration of *func* and return its result."""
    @functools.wraps(func)
    def wrap(*arg, **kwargs):
        start_time = time.time()
        result = func(*arg, **kwargs)
        print('获取车型链接类方法运行时间为:', time.time() - start_time)
        # The original returned ``func`` here, which discarded the result.
        return result
    return wrap


def try_expect(funcs):
    """Decorator: run *funcs*, reporting instead of propagating any exception.

    Returns the wrapped function's result on success. On failure the
    exception is printed and the same error string as before is returned,
    so existing callers keep working.
    """
    @functools.wraps(funcs)
    def getwrap(*arg, **kwargs):
        try:
            result = funcs(*arg, **kwargs)
        except Exception as exc:
            print('-----------出错啦!!!!!!!!!!', exc)
            return '-----------出错啦!!!!!!!!!!'
        print('时间:', datetime.now(), '-----------未发生异常------------')
        return result
    return getwrap


def _first(items, default=''):
    """Return the first element of *items*, or *default* when it is empty."""
    return items[0] if items else default


def _parse_html(text):
    """Parse *text* with lxml; return None for empty or unparsable input."""
    if not text or not text.strip():
        return None
    try:
        return etree.HTML(text)
    except (etree.LxmlError, ValueError):
        return None


class Yi_car_data(object):
    """Logged-in crawling session plus the page-specific extraction methods."""

    headers = {'Referer': 'http://i.yiche.com/authenservice/login.html?returnurl=http%3A%2F%2Fguangzhou.bitauto.com%2F%3Freferrer%3Dhttp%3A%2F%2Fi.yiche.com%2Fauthenservice%2FAboutPassWord%2FResetPasswordResult.aspx%3Freturnurl%3Dhttp%253a%252f%252fi.yiche.com%252fu27686084%252f'}
    login_url = 'http://i.yiche.com/ajax/Authenservice/login.ashx'
    # NOTE(security): credentials are hard-coded in source. They can be
    # overridden with the YICHE_LOGIN_NAME / YICHE_PASSWORD environment
    # variables; the literals remain only as backward-compatible defaults
    # and should be removed from version control.
    data = {
        'txt_LoginName': os.environ.get('YICHE_LOGIN_NAME', '15766264244'),
        'txt_Password': os.environ.get('YICHE_PASSWORD', '123456789aa'),
        'txt_Code': '',
        'cbx_keepState': 'true',
        'returnurl': 'http://guangzhou.bitauto.com/?referrer=http://i.yiche.com/authenservice/AboutPassWord/ResetPasswordResult.aspx?returnurl=http://i.yiche.com/u27686084/',
        'guid': '',
        'Gamut': 'true',
    }
    cartype_url = 'http://api.car.bitauto.com/CarInfo/getlefttreejson.ashx?tagtype=chexing&pagetype=masterbrand&objid=0'

    def __init__(self):
        """Open the Redis connection and log in; network errors propagate."""
        self.pool = redis.ConnectionPool(host='127.0.0.1', port=6379)
        self.r = redis.Redis(connection_pool=self.pool)
        self.s_requests = requests.Session()
        html = self.s_requests.post(
            self.login_url, data=self.data, headers=self.headers, timeout=2)
        print(html.status_code)

    def _request_headers(self):
        """Return a per-request header dict with a random User-Agent.

        A copy is used so the class-level ``headers`` dict, shared by every
        instance and worker thread, is never mutated.
        """
        headers = dict(self.headers)
        headers['User-Agent'] = random.choice(user_agent.user_agent_list)
        return headers

    def _cache_brand_list(self, cartype_datas):
        """Store the brand list in Redis unless the same text is already cached."""
        serialized = str(cartype_datas)
        try:
            cached = self.r.get('car_url_name')
            # Redis returns bytes; decode before comparing, otherwise
            # non-ASCII brand names never match their escaped repr.
            cached_text = cached.decode('utf-8', 'replace') if cached else ''
            if serialized in cached_text:
                print('---------------数据有重复-------------------------')
            else:
                self.r.set('car_url_name', serialized)
        except redis.RedisError as e:
            print(e)
            print('----------------------------插入redis失败--------------------')

    def Car_types_all_datas(self, max_retries=5):
        """Download the brand tree and return ``[(brand_path, brand_name), ...]``.

        The request is retried up to *max_retries* times (one second apart)
        until it returns HTTP 200 or a body containing a known brand name.
        Returns an empty list when the download or the parse fails.
        """
        response = None
        for _ in range(max_retries):
            time.sleep(1)
            try:
                candidate = self.s_requests.get(
                    self.cartype_url, headers=self._request_headers(), timeout=3)
            except requests.RequestException as e:
                print('------------抛出异常----------------:', e)
                continue
            # ``.text`` (str) rather than ``.content`` (bytes): testing a
            # str against bytes raises TypeError on Python 3.
            if candidate.status_code == 200 or '奥迪' in candidate.text:
                response = candidate
                break
            print('downing field')

        if response is None:
            print('------------------------解析数据为空---------------------')
            return []

        body = response.text
        cartype_url = _BRAND_URL_RE.findall(body)
        cartype_name = _BRAND_NAME_RE.findall(body)
        if not cartype_url:
            if not cartype_name:
                print('------------------------解析数据为空---------------------')
            return []

        cartype_datas = list(zip(cartype_url, cartype_name))
        self._cache_brand_list(cartype_datas)
        return cartype_datas

    @try_expect  # report instead of raising
    @run_time  # print the elapsed time
    def car_type_all_info_time_expect(self):
        """Timed, exception-guarded variant of :meth:`Car_types_all_datas`."""
        return self.Car_types_all_datas()

    def car_type_all_info(self):
        """Return the absolute URL of every brand page."""
        return [BASE_URL + str(url_name[0])
                for url_name in self.Car_types_all_datas()]

    def get_car_info(self, url):
        """Return ``[(series_url, series_name), ...]`` found on a brand page.

        Returns an empty list when the request fails or nothing is extracted.
        """
        try:
            req = self.s_requests.get(
                url, headers=self._request_headers(), timeout=5)
        except requests.RequestException as e:
            print('--------------------出错啦---------------------------!!!!!!!!!!', e)
            return []

        select = _parse_html(req.text)
        if select is None:
            print('----------------提取数据失败-----------------------')
            return []

        car_url = [BASE_URL + str(href)
                   for href in select.xpath('//li[@class="name"]/a/@href')]
        car_name = [str(title)
                    for title in select.xpath('//li[@class="name"]/a/@title')]
        if not car_url or not car_name:
            print('----------------提取数据失败-----------------------')
            return []
        return list(zip(car_url, car_name))

    def get_all_cartypes_info(self, infourl):
        """Fetch one series page, print and return its extracted fields.

        Returns a dict keyed by the Chinese field names, or None when the
        page cannot be downloaded or parsed. Fields missing from the page
        are returned as empty strings / empty lists instead of raising.
        """
        time.sleep(1)  # crude rate limit per worker
        try:
            reqs = self.s_requests.get(
                infourl, headers=self._request_headers(), timeout=5)
        except requests.RequestException as e:
            print('------------出错啦-----------', e)
            return None
        print(reqs.status_code)

        page = reqs.text
        carxpath = _parse_html(page)
        if carxpath is None:
            print('------------出错啦-----------', 'empty page: ' + str(infourl))
            return None

        # Plain ``str`` copies so the results do not keep the parsed tree alive.
        car_type = [str(t) for t in carxpath.xpath('//a[@class="txt"]/text()')]
        car_price = [str(p) for p in carxpath.xpath('//span[@class="price"]/text()')]

        # No second argument to findall(): on a compiled pattern it is the
        # start position, not a flags value.
        displacement = _first(_DISPLACEMENT_RE.findall(page), ('', ''))[1]
        gearbox = _first(_GEARBOX_RE.findall(page))
        car_baozhilv = str(_first(carxpath.xpath('//a[@class="lnk-bzl"]/text()')))[:6]
        car_youhao = str(_first(carxpath.xpath('//a[@class="data"]/text()')))[:8]

        data = {
            '车型': car_type,
            '车价': car_price,
            '排量': displacement,
            '变速箱': gearbox,
            '五年保值率': car_baozhilv,
            '油耗': car_youhao,
        }
        print(data)
        return data


def all_run_main():
    """Yield, per brand, the non-empty list of ``(series_url, series_name)``."""
    tn_car = Yi_car_data()
    # Timed probe of the brand tree; its outcome is only reported.
    tn_car.car_type_all_info_time_expect()
    for brand_url in tn_car.car_type_all_info():
        try:
            series = tn_car.get_car_info(brand_url)
        except Exception as e:
            # Skip this brand rather than re-yielding the previous result.
            print(e)
            continue
        if series:
            yield series


def threading_run_main(max_workers=100):
    """Crawl every series page concurrently and return the extracted dicts.

    *max_workers* is the thread-pool size. Pages that fail are skipped.
    """
    tn_car = Yi_car_data()

    def fetch(url):
        # Boundary handler: one bad page must not abort the whole crawl.
        try:
            return tn_car.get_all_cartypes_info(url)
        except Exception as e:
            print('------------出错啦-----------', e)
            return None

    # Every brand is used; the original kept only the first brand's links
    # after fetching all of them.
    car_urls = [series_url
                for series in all_run_main()
                for series_url, _name in series]
    print('链接数量:', len(car_urls))
    if not car_urls:
        return []

    # The context manager waits for the workers and shuts the pool down;
    # consuming the map makes sure each task's outcome is collected.
    with ThreadPoolExecutor(max_workers) as pool:
        results = list(pool.map(fetch, car_urls))
    return [record for record in results if record]


if __name__ == "__main__":
    records = threading_run_main()
    import pymongo
    if records:
        conn = pymongo.MongoClient('localhost', 27017)  # connect to the server
        try:
            # insert_many() replaces the invalid ``items.insert('')`` call.
            conn.fangtianxia.items.insert_many(records)
        finally:
            conn.close()

# Manual smoke test for a single series page:
#   tn_car = Yi_car_data()
#   tn_car.get_all_cartypes_info('http://car.bitauto.com/aodiq3haiwai/')
#   print(list(all_run_main()))