zoukankan      html  css  js  c++  java
  • python爬虫爬取人人车(二手车),利用pandas、matplotlib生成图表,并将信息导出为csv格式

    该程序主要用于抓取人人车的卖车信息,包括车系、车型号、购车日期、卖车价格、行驶里程、首付价格等。话不多说,直接上代码。

    入库之后,可以用下面的命令将 MongoDB 里的信息导出为 CSV 文件(可用 Excel 打开):

    mongoexport -d myDB -c user -f _id,name,password,adress --csv -o ./user.csv

    -d 指定数据库
    -c 指定数据表(集合)
    -f 指定需要导出的字段(field),多个字段用逗号分隔
    -o 指定输出路径

    车系py文件

    # -*- coding: utf-8 -*-
    import re
    from urllib.request import urlopen
    from scrapy.http import Request
    # from urllib.request import Request
    from bs4 import BeautifulSoup
    from lxml import etree
    import pymongo
    import scrapy
    from scrapy.selector import HtmlXPathSelector
    # Module-level MongoDB handle shared by the spider below: connects to the
    # local server and selects the `renrenche` database.
    client = pymongo.MongoClient(host="127.0.0.1")
    db = client.renrenche
    collection = db.Carclass          # collection name (classification records)
    
    
    import redis        # Redis client library
    # Module-level Redis connection (local server, database 0).
    r = redis.Redis(host='127.0.0.1', port=6379, db=0)
    
    class renrencheSpider(scrapy.Spider):
        """Collect brand and series links from the renrenche used-car listing.

        Every brand and series is stored in the module-level MongoDB
        ``collection``; every series URL is additionally pushed onto the Redis
        list ``carurl`` as ``"<classid>,<url>,<pid>"``.
        """

        name = "Carinfo1"
        allowed_domains = ["renrenche.com"]   # domains the spider may visit
        start_urls = [
            "https://www.renrenche.com/bj/ershouche/"
        ]

        def parse(self, response):
            """Store each brand link and schedule a request for its page.

            Called by Scrapy for every downloaded start URL. Yields one
            ``Request`` per brand anchor; the callback receives the Mongo id
            of the brand (as a string) as ``pid``.
            """
            # response.xpath replaces the deprecated HtmlXPathSelector/.select API.
            brand_links = response.xpath(
                '//div[@class="brand-more-content"]'
                '/div[@class="brand-section brand-section-1"]'
                '/p[@class="bl"]/span[@class="bn"]/a'
            )
            for link in brand_links:
                href = link.xpath("@href").extract_first()
                if not href:
                    # Anchor without a target: nothing to follow.
                    continue
                brand_url = response.urljoin(href)
                # Kept as the list returned by extract(), as stored originally.
                name = link.xpath("text()").extract()
                classid = self.insertMongo(name, None)
                print(brand_url)
                print(name)
                # pid is bound as a default argument so each request keeps its
                # own brand id (avoids the late-binding closure pitfall).
                yield Request(
                    brand_url,
                    callback=lambda response, pid=str(classid): self.parse_subclass(response, pid),
                )

        def parse_subclass(self, response, pid):
            """Store each series link of a brand page and queue its URL in Redis.

            ``pid`` is the Mongo id (string) of the parent brand record.
            """
            series_links = response.xpath('//ul[@id="filter_series"]/li[@class=""]/a')
            for link in series_links:
                href = link.xpath("@href").extract_first()
                if not href:
                    continue
                url = response.urljoin(href)
                name = link.xpath("text()").extract()
                print(url)
                print(name)
                classid = self.insertMongo(name, pid)
                self.pushRedis(classid, url, pid)

        def insertMongo(self, classname, pid):
            """Insert a ``{'classname', 'pid'}`` document and return its ``_id``."""
            # insert_one replaces Collection.insert, which PyMongo 4 removed.
            result = collection.insert_one({'classname': classname, 'pid': pid})
            return result.inserted_id

        def pushRedis(self, classid, url, pid,):
            """Push ``"<classid>,<url>,<pid>"`` onto the Redis list ``carurl``."""
            carurl = '%s,%s,%s' % (classid, url, pid)
            r.lpush('carurl', carurl)
    

      卖车各种信息py文件

    # -*- coding: utf-8 -*-
    import re
    from urllib.request import urlopen
    from scrapy.http import Request
    import pymongo
    import scrapy
    from time import sleep
    from scrapy.selector import HtmlXPathSelector

    # Module-level MongoDB handle shared by the spider below: connects to the
    # local server and selects the `Carinfo` collection of the `renrenche` database.
    client = pymongo.MongoClient(host="127.0.0.1")
    db = client.renrenche
    collection = db.Carinfo

    import redis # Redis client library

    # Module-level Redis connection (local server, database 0).
    r = redis.Redis(host='127.0.0.1', port=6379, db=0)




    class renrencheSpider(scrapy.Spider):
        """Scrape car listings for every series URL queued in Redis.

        The Redis list ``carurl`` holds entries of the form
        ``"<classid>,<url>,<pid>"``. Each URL becomes a start URL, and every
        car found on the page is stored in the module-level MongoDB
        ``collection`` together with the ``<classid>`` of its entry.
        """

        name = "Carinfo2"
        allowed_domains = ["renrenche.com"]
        # NOTE: the attribute name `dict` is kept for compatibility; it maps
        # url -> {"pid": ..., "num": ...}. Both containers are re-created per
        # instance in __init__ so instances never share state.
        dict = {}
        start_urls = []

        def __init__(self, *args, **kwargs):
            """Load the start URLs and their class ids from Redis."""
            # Scrapy may pass arguments when it builds the spider; forward them.
            super().__init__(*args, **kwargs)
            self.dict = {}
            self.start_urls = []
            for item in r.lrange('carurl', 0, -1):
                fields = item.decode().split(',')
                if len(fields) < 2:
                    # Malformed entry: no URL to crawl.
                    continue
                pid, url = fields[0], fields[1]
                if url not in self.dict:
                    # Queue each URL once even if Redis holds duplicates.
                    self.start_urls.append(url)
                self.dict[url] = {"pid": pid, "num": 0}

        def _lookup_class_info(self, response):
            """Return the record stored for this page's start URL, or None."""
            info = self.dict.get(response.url)
            if info is not None:
                return info
            # After a redirect response.url differs from the queued URL; the
            # redirect middleware records the original URLs in request meta.
            for earlier_url in response.meta.get('redirect_urls') or []:
                if earlier_url in self.dict:
                    return self.dict[earlier_url]
            return None

        def parse(self, response):
            """Store one MongoDB document per car listed on the page."""
            class_info = self._lookup_class_info(response)
            if class_info is None:
                self.logger.warning("No class info recorded for %s", response.url)
                return None
            pid = class_info['pid']
            # NOTE(review): `num` is never incremented anywhere in this class,
            # so this guard never triggers — presumably a leftover page limit.
            if class_info['num'] > 3:
                return None

            # NOTE(review): assumes each car's title, mileage, price and down
            # payment are nested inside its own <li> — confirm against the page.
            cars = response.xpath(
                '//ul[@class="row-fluid list-row js-car-list"]'
                '//li[@class="span6 list-item car-item"]'
            )
            for car in cars:
                # Relative XPaths ('./', './/') keep each lookup inside this
                # car; an absolute '//' would always match the first car.
                title = car.xpath(
                    './a[@rrc-event-param="search"]/h3/text()'
                ).extract_first()
                basics = car.xpath(
                    './/div[@class="mileage"]/span[@class="basic"]/text()'
                ).extract()
                price = car.xpath(
                    './/div[@class="tags-box"]/div[@class="price"]/text()'
                ).extract_first()
                down_payment = car.xpath(
                    './/div[@class="down-payment"]/div[@class="m-l"]/text()'
                ).extract_first()
                if (title is None or len(basics) < 2
                        or price is None or down_payment is None):
                    self.logger.debug("Skipping incomplete listing on %s", response.url)
                    continue

                model = "型号:" + title
                print(model)
                # Fields are passed directly; joining them with spaces and
                # re-splitting would break on model names containing spaces.
                self.insertMongo(
                    model,
                    "购车年份/公里数:" + basics[0] + "/" + basics[1],
                    "卖车价格:" + price.strip() + "万",
                    "首付:" + down_payment + "万",
                    pid,
                )

        def insertMongo(self, classname, classname1, classname2, classname3, pid):
            """Insert one car document and return its ``_id``."""
            # insert_one replaces Collection.insert, which PyMongo 4 removed.
            result = collection.insert_one({
                'classname': classname,
                'classname1': classname1,
                'classname2': classname2,
                'classname3': classname3,
                'pid': pid,
            })
            return result.inserted_id

      

  • 相关阅读:
    站立会议08
    站立会议07
    站立会议06
    站立会议05
    SOA架构设计的案例分析
    java实现根据高德地图API接口进行地址位置解析,将地址转化为经纬度
    java实现根据起点终点和日期查询去哪儿网的火车车次和火车站点信息
    Cocos2d切换场景出现的问题-error C2653: “***”不是类或命名空间名称
    云时代架构之点融支付系统架构的演进
    云时代架构之百度万人协同规模下的代码管理架构演进
  • 原文地址:https://www.cnblogs.com/wangyuhangboke/p/8046115.html
Copyright © 2011-2022 走看看