zoukankan      html  css  js  c++  java
  • 爬取易车网所有车系车型数据

    下面就是源代码,我直接把数据保存在字典里,并打印到控制台。

    
    
      1 # -*- coding: utf-8 -*-
      2 from lxml import etree
      3 import requests,re,random
      4 import time,redis
      5 from myweb.yichewang import user_agent
      6 import urllib.request
      7 from concurrent.futures import ThreadPoolExecutor
      8 
      9 import pymysql
     10 from datetime import datetime
     11 from wxpy import *
     12 #bot = Bot(cache_path=True)#微信端监控运行情况
     13 
     def run_time(func):
         """Decorator that prints how long a call to ``func`` takes.

         The wrapper forwards all positional and keyword arguments to ``func``,
         prints the elapsed wall-clock time once the call has finished, and
         returns whatever ``func`` returned.  If ``func`` raises, the exception
         propagates and no timing line is printed.
         """
         # Local import: functools is not among the file-level imports.
         from functools import wraps

         @wraps(func)  # keep the wrapped function's name and docstring
         def wrap(*arg, **kwargs):
             start_time = time.time()
             result = func(*arg, **kwargs)
             print('获取车型链接类方法运行时间为:', time.time() - start_time)
             # Hand back the call's result; returning the function object itself
             # would hide the real return value from every caller.
             return result

         return wrap
     22 
     23 
     def try_expect(funcs):
         """Decorator that converts any exception from ``funcs`` into an error string.

         On success the wrapper prints a timestamped "no exception" line and
         returns the value produced by ``funcs``.  On failure it prints the
         timestamp together with the exception and returns the fixed marker
         string ``'-----------出错啦!!!!!!!!!!'`` instead of raising.
         """
         # Local import: functools is not among the file-level imports.
         from functools import wraps

         error_marker = '-----------出错啦!!!!!!!!!!'

         @wraps(funcs)  # keep the wrapped function's name and docstring
         def getwrap(*arg, **kwargs):
             try:
                 result = funcs(*arg, **kwargs)
             except Exception as exc:
                 # Report what went wrong; returning the marker alone would
                 # discard the only clue about the failure.
                 print('时间:', datetime.now(), error_marker, repr(exc))
                 return error_marker
             print('时间:', datetime.now(), '-----------未发生异常------------')
             return result

         return getwrap
     34 
     class Yi_car_data(object):
         """Scraper for brand, series and model pages on bitauto.com / yiche.com.

         Creating an instance builds a Redis connection pool and posts the login
         form through a ``requests.Session``; every later request reuses that
         session.  Methods that fetch pages report failures on stdout and return
         an empty result instead of raising for request errors.
         """

         # NOTE(review): the login name and password below are hard-coded in the
         # source; they should be loaded from configuration or the environment.
         headers = {'Referer':'http://i.yiche.com/authenservice/login.html?returnurl=http%3A%2F%2Fguangzhou.bitauto.com%2F%3Freferrer%3Dhttp%3A%2F%2Fi.yiche.com%2Fauthenservice%2FAboutPassWord%2FResetPasswordResult.aspx%3Freturnurl%3Dhttp%253a%252f%252fi.yiche.com%252fu27686084%252f'}
         login_url = 'http://i.yiche.com/ajax/Authenservice/login.ashx'
         data = {
             'txt_LoginName':'15766264244',
             'txt_Password':'123456789aa',
             'txt_Code':'',
             'cbx_keepState':'true',
             'returnurl':'http://guangzhou.bitauto.com/?referrer=http://i.yiche.com/authenservice/AboutPassWord/ResetPasswordResult.aspx?returnurl=http://i.yiche.com/u27686084/',
             'guid':'',
             'Gamut':'true'

         }
         cartype_url = 'http://api.car.bitauto.com/CarInfo/getlefttreejson.ashx?tagtype=chexing&pagetype=masterbrand&objid=0'

         # Compiled once for the whole class instead of on every call.
         _brand_url_re = re.compile('url:"(.*?)"')
         _brand_name_re = re.compile('name:"(.*?)"')
         _displacement_re = re.compile('<span class="data" title="(.*?)">(.*?)</span>')  # engine displacement
         _gearbox_re = re.compile('<span class="data">(.*?)</span>')  # gearbox

         def __init__(self):
             """Open the Redis pool, create the HTTP session and log in.

             The login POST uses a 2 second timeout; request errors raised by it
             are not caught here.
             """
             self.pool = redis.ConnectionPool(host='127.0.0.1', port=6379)
             self.r = redis.Redis(connection_pool=self.pool)
             self.s_requests = requests.Session()
             html = self.s_requests.post(self.login_url, data=self.data, headers=self.headers, timeout=2)
             print(html.status_code)

         def _request_headers(self):
             """Return a copy of the base headers with a randomly chosen User-Agent.

             A copy is used so the class-level ``headers`` dict, which is shared
             by every instance and thread, is never mutated.
             """
             request_headers = dict(self.headers)
             request_headers['User-Agent'] = random.choice(user_agent.user_agent_list)
             return request_headers

         def _cache_brand_tree(self, cartype_datas):
             """Store the brand list in Redis under ``car_url_name`` unless unchanged.

             Redis errors are printed and otherwise ignored (best effort).
             """
             serialized = str(cartype_datas)
             try:
                 cached = self.r.get('car_url_name')
                 if isinstance(cached, bytes):
                     # Decode before comparing: str() of a bytes value gives an
                     # escaped "b'...'" repr that never equals the stored text.
                     cached = cached.decode('utf-8', errors='replace')
                 if cached == serialized:
                     print('---------------数据有重复-------------------------')
                 else:
                     self.r.set('car_url_name', serialized)
             except redis.RedisError as e:
                 print(e)
                 print('----------------------------插入redis失败--------------------')

         def Car_types_all_datas(self):
             """Fetch the brand tree and return it as ``[(url, name), ...]``.

             The request is repeated (one second apart) until the response has
             status 200 or its text contains '奥迪'.  The parsed list is also
             cached in Redis.

             Returns:
                 A list of ``(url, name)`` tuples; an empty list when the request
                 fails or nothing could be parsed.
             """
             request_headers = self._request_headers()
             try:
                 while True:
                     time.sleep(1)
                     reponse = self.s_requests.get(self.cartype_url, headers=request_headers, timeout=3)
                     # Compare against the decoded text: testing a str against
                     # the bytes in ``.content`` raises TypeError on Python 3.
                     if reponse.status_code == 200 or '奥迪' in reponse.text:
                         break
                     print('downing field')
             except requests.RequestException as e:
                 print('------------抛出异常----------------:', e)
                 return []

             text = str(reponse.text)
             cartype_datas = list(zip(self._brand_url_re.findall(text),
                                      self._brand_name_re.findall(text)))
             if not cartype_datas:
                 print('------------------------解析数据为空---------------------')
                 return []
             self._cache_brand_tree(cartype_datas)
             return cartype_datas

         @try_expect  # catch runtime exceptions
         @run_time  # measure how long the method takes
         def car_type_all_info_time_expect(self):
             """Run ``Car_types_all_datas`` under the timing and exception decorators."""
             return self.Car_types_all_datas()

         def car_type_all_info(self):
             """Return the absolute URL of every brand page."""
             return ['http://car.bitauto.com/' + str(brand_url)
                     for brand_url, _brand_name in self.Car_types_all_datas()]

         def get_car_info(self, url):
             """Return ``[(series_url, series_name), ...]`` for one brand page.

             Args:
                 url: absolute URL of a brand page.

             Returns:
                 A list of ``(url, name)`` tuples; an empty list when the request
                 fails or no series links are found.
             """
             try:
                 req = self.s_requests.get(url, headers=self._request_headers(), timeout=5)
             except requests.RequestException as e:
                 print('--------------------出错啦---------------------------!!!!!!!!!!', e)
                 return []

             select = etree.HTML(str(req.text))
             # NOTE(review): etree.HTML appears to return None for an empty
             # document; guard so the xpath calls cannot hit AttributeError.
             if select is None:
                 print('----------------提取数据失败-----------------------')
                 return []
             car_url = ['http://car.bitauto.com/' + str(href)
                        for href in select.xpath('//li[@class="name"]/a/@href')]
             car_name = select.xpath('//li[@class="name"]/a/@title')
             if not car_url or not car_name:
                 print('----------------提取数据失败-----------------------')
                 return []
             return list(zip(car_url, car_name))

         def get_all_cartypes_info(self, infourl):
             """Scrape one series page, print the extracted fields and return them.

             Args:
                 infourl: absolute URL of a series page.

             Returns:
                 A dict with the keys '车型', '车价', '排量', '变速箱', '五年保值率'
                 and '油耗'.  Scalar fields that are missing from the page are
                 ``None``.  Returns ``None`` when the page cannot be fetched or
                 parsed.
             """
             time.sleep(1)
             try:
                 reqs = self.s_requests.get(infourl, headers=self._request_headers(), timeout=5)
             except requests.RequestException as e:
                 print('------------出错啦-----------', e)
                 return None
             print(reqs.status_code)

             page = str(reqs.text)
             carxpath = etree.HTML(page)
             if carxpath is None:
                 print('----------------提取数据失败-----------------------')
                 return None

             # search() scans the whole page.  Passing re.X as the second
             # argument of a compiled pattern's findall() sets ``pos`` (64), so
             # the first 64 characters would be skipped.
             displacement = self._displacement_re.search(page)
             gearbox = self._gearbox_re.search(page)
             resale_rate = carxpath.xpath('//a[@class="lnk-bzl"]/text()')
             fuel_use = carxpath.xpath('//a[@class="data"]/text()')

             data = {}
             # str() turns lxml's string results into plain strings.
             data['车型'] = [str(t) for t in carxpath.xpath('//a[@class="txt"]/text()')]  # model names
             data['车价'] = [str(p) for p in carxpath.xpath('//span[@class="price"]/text()')]  # prices
             data['排量'] = displacement.group(2) if displacement else None
             data['变速箱'] = gearbox.group(1) if gearbox else None
             data['五年保值率'] = str(resale_rate[0])[:6] if resale_rate else None
             data['油耗'] = str(fuel_use[0])[:8] if fuel_use else None
             print(data)
             return data
    167 
    168 
    def all_run_main():
        """Yield the series list of every brand, one brand page at a time.

        A ``Yi_car_data`` instance is created, the brand tree is fetched once
        through the timed / exception-guarded wrapper, and then each brand URL
        from ``car_type_all_info`` is passed to ``get_car_info``.

        Yields:
            Whatever ``get_car_info`` returns for each brand URL.  Brands whose
            lookup raises are reported on stdout and skipped.
        """
        tn_car = Yi_car_data()
        # Timed probe of the brand tree; its result is not used here.
        tn_car.car_type_all_info_time_expect()
        for brand_url in tn_car.car_type_all_info():
            try:
                series = tn_car.get_car_info(brand_url)
            except Exception as e:
                # Skip the failed brand.  Yielding a shared global here would
                # repeat the previous brand's data, or raise NameError when the
                # very first lookup fails.
                print(e)
                continue
            yield series
    187 
    188 
    def threading_run_main():
        """Scrape every series page concurrently and return the collected results.

        The series URLs of all brands yielded by ``all_run_main`` are gathered
        and passed to ``Yi_car_data.get_all_cartypes_info`` on a pool of 100
        worker threads.

        Returns:
            A list with one entry per series URL, in input order: the value
            returned by ``get_all_cartypes_info``, or ``None`` when that call
            raised.
        """
        tn_car = Yi_car_data()

        def fetch(series_url):
            # One bad page must not abort the whole crawl, so failures are
            # reported here instead of propagating out of pool.map().
            try:
                return tn_car.get_all_cartypes_info(series_url)
            except Exception as e:
                print('------------出错啦-----------', e)
                return None

        car_urls = []
        for series_list in all_run_main():
            if not series_list:  # brand page that yielded nothing
                continue
            # Collect the series of every brand, not only the first one.
            car_urls.extend(pair[0] for pair in series_list)
        print('链接数量:', len(car_urls))

        # The with-block waits for all workers and releases the threads.
        with ThreadPoolExecutor(100) as pool:  # number of worker threads
            return list(pool.map(fetch, car_urls))
    198 
    199 
    200 
    if __name__ == "__main__":
        # Run the crawl, then store whatever records it produced in MongoDB.
        scraped = threading_run_main()
        # Keep only real documents: the crawl may return None, or a list that
        # contains None for pages that failed.
        records = [item for item in (scraped or []) if isinstance(item, dict)]
        if records:
            import pymongo
            conn = pymongo.MongoClient('localhost', 27017)   # connect to the server
            try:
                # insert_many stores the scraped dicts; inserting the empty
                # string '' is rejected because it is not a document.
                conn.fangtianxia.items.insert_many(records)
            finally:
                conn.close()
    
    
    
     
  • 相关阅读:
    docker PXC MYSQL集群节点启动失败/节点顺序消失/只剩一个节点存在问题的解决
    springgateway
    rabbitMQ重复消费(结合死循环重发那一篇看)
    rabbitMq可靠性投递之手动ACK
    3表查询,1:多:多,根据1查多再查多
    tp后台注册登录配置项
    volist/foreach下,点击循环中的一个进行操作
    生成随机订单号
    省市县的下拉列表
    银行下拉列表
  • 原文地址:https://www.cnblogs.com/Huangsh2017Come-on/p/7904570.html
Copyright © 2011-2022 走看看