zoukankan      html  css  js  c++  java
  • Python——爬取瓜子二手车

    # coding:utf8
    # author:Jery
    # datetime:2019/5/1 5:16
    # software:PyCharm
    # function:爬取瓜子二手车
    import requests
    from lxml import etree
    import re
    import csv
    
    # First listing page to crawl (city-wide used-car list, page 1).
    start_url = 'https://www.guazi.com/www/buy/o1c-1'
    # Request headers: guazi.com blocks requests without a browser User-Agent
    # and a valid session Cookie (anti-scraping). NOTE(review): the Cookie is
    # session-bound and will expire — it must be refreshed from a real browser
    # session before each run.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36',
        'Cookie': 'uuid=6032a689-d79a-4060-c8b0-f57d8db4e245; antipas=16I7A500578955101K231sG39E; clueSourceCode=10103000312%2300; user_city_id=49; ganji_uuid=3434204287155953305008; sessionid=405f3fb6-fb90-409d-c7ed-32874a157920; lg=1; cainfo=%7B%22ca_s%22%3A%22pz_baidu%22%2C%22ca_n%22%3A%22tbmkbturl%22%2C%22ca_medium%22%3A%22-%22%2C%22ca_term%22%3A%22-%22%2C%22ca_content%22%3A%22%22%2C%22ca_campaign%22%3A%22%22%2C%22ca_kw%22%3A%22-%22%2C%22keyword%22%3A%22-%22%2C%22ca_keywordid%22%3A%22-%22%2C%22scode%22%3A%2210103000312%22%2C%22ca_transid%22%3A%22%22%2C%22platform%22%3A%221%22%2C%22version%22%3A1%2C%22ca_i%22%3A%22-%22%2C%22ca_b%22%3A%22-%22%2C%22ca_a%22%3A%22-%22%2C%22display_finance_flag%22%3A%22-%22%2C%22client_ab%22%3A%22-%22%2C%22guid%22%3A%226032a689-d79a-4060-c8b0-f57d8db4e245%22%2C%22sessionid%22%3A%22405f3fb6-fb90-409d-c7ed-32874a157920%22%7D; cityDomain=mianyang; _gl_tracker=%7B%22ca_source%22%3A%22-%22%2C%22ca_name%22%3A%22-%22%2C%22ca_kw%22%3A%22-%22%2C%22ca_id%22%3A%22-%22%2C%22ca_s%22%3A%22self%22%2C%22ca_n%22%3A%22-%22%2C%22ca_i%22%3A%22-%22%2C%22sid%22%3A20570070983%7D; preTime=%7B%22last%22%3A1556660763%2C%22this%22%3A1556659891%2C%22pre%22%3A1556659891%7D'
    }
    
    
    # 获取详情页面url
    # Collect detail-page URLs from one listing page.
    def get_detail_urls(url):
        """Fetch a listing page and extract the car detail-page URLs.

        :param url: absolute URL of a guazi.com listing page
        :return: (urls, index, next_url) where
                 urls     -- list of absolute detail-page URLs on this page
                 index    -- xpath result for the highlighted page number
                             (list of text nodes, possibly empty)
                 next_url -- relative href of the last pagination link
                             (the "next page" link), or None when the page
                             has no pagination links (e.g. the final page)
        """
        response = requests.get(url, headers=headers)
        html = etree.HTML(response.content.decode('utf-8'))
        # Currently highlighted page number in the pager.
        index = html.xpath('//ul[@class="pageLink clearfix"]/li[@class="link-on"]/a/span/text()')
        # The last pager link is "next page"; guard against an empty pager
        # (original code indexed [-1] unconditionally and raised IndexError
        # on pages without pagination).
        page_links = html.xpath('//ul[@class="pageLink clearfix"]/li/a/@href')
        next_url = page_links[-1] if page_links else None
        # Each <li> in the car list carries one link to a detail page.
        ul = html.xpath('//ul[@class="carlist clearfix js-top"]')[0]
        urls = []
        for li in ul.xpath('./li'):
            hrefs = li.xpath('./a/@href')
            if hrefs:  # skip malformed list items with no anchor
                urls.append('https://www.guazi.com' + hrefs[0])
        return urls, index, next_url
    
    
    def get_info(url):
        """Fetch one car detail page and extract its key attributes.

        :param url: absolute URL of a guazi.com car detail page
        :return: dict with keys 'city', 'title', 'cardtime' (registration
                 date), 'cardplace' (registration place, '' when absent),
                 'kms' (displayed mileage), 'displacement', 'speedbox'
                 (gearbox type) and 'price'
        """
        response = requests.get(url, headers=headers)
        text = response.content.decode('utf-8')
        html = etree.HTML(text)
        infos_dict = {}
        # City name: keep only the CJK characters (the node also contains
        # punctuation/whitespace). The original pattern had lost its
        # backslashes ('[u4e00-u9fa5]' matches the literal letters u,4,e...).
        city = html.xpath('//p[@class="city-curr"]/text()')[0]
        city = re.search(r'[\u4e00-\u9fa5]+', city).group(0)
        infos_dict['city'] = city
        # Title contains embedded line breaks and padding from the markup;
        # collapse them. (Original used replace on a '\r\n' sequence whose
        # backslashes were lost in transit.)
        title = html.xpath('//div[@class="product-textbox"]/h2/text()')[0]
        infos_dict['title'] = title.replace('\r\n', '').strip()
        # The spec list has 4 entries when the registration place is missing,
        # 5 when it is present; positions shift accordingly.
        infos = html.xpath('//div[@class="product-textbox"]/ul/li/span/text()')
        infos_dict['cardtime'] = infos[0]
        infos_dict['kms'] = infos[1]
        if len(infos) == 4:
            infos_dict['cardplace'] = ''
            infos_dict['displacement'] = infos[2]
            infos_dict['speedbox'] = infos[3]
        else:
            infos_dict['cardplace'] = infos[2]
            infos_dict['displacement'] = infos[3]
            infos_dict['speedbox'] = infos[4]

        # Numeric price, optional decimal part. The original pattern
        # 'd+.?d+' had lost its backslashes and also failed on single-digit
        # prices; '\d+(?:\.\d+)?' handles both '9' and '9.58'.
        price = html.xpath('//div[@class="product-textbox"]/div/span[@class="pricestype"]/text()')[0]
        infos_dict['price'] = re.search(r'\d+(?:\.\d+)?', price).group(0)
        return infos_dict
    
    
    def main():
        """Crawl listing pages starting at start_url and append one CSV row
        per car to guazi.csv, following the "next page" link until the last
        page is reached.
        """
        # NOTE(review): the original path string had lost its backslashes
        # ('C:UsersJery...'); restored as a raw Windows path — confirm it
        # matches the intended output location.
        out_path = r'C:\Users\Jery\Desktop\guazi.csv'
        # Write the header row once, truncating any previous file.
        # newline='' is required so the csv module controls line endings;
        # utf-8-sig lets Excel open the Chinese headers correctly.
        with open(out_path, 'w', newline='', encoding='utf-8-sig') as f:
            csv.writer(f, dialect='excel').writerow(
                ['城市', '车型', '上牌时间', '上牌地', '表显里程', '排量', '变速箱', '价格'])
        url = start_url
        while True:
            urls, index, next_url = get_detail_urls(url)
            print("当前页码:{}*****************".format(index))
            # Append this page's rows (newline=''/encoding must match the
            # header write, or Excel shows blank rows / mojibake).
            with open(out_path, 'a', newline='', encoding='utf-8-sig') as f:
                csvwriter = csv.writer(f, dialect='excel')
                for detail_url in urls:
                    print("正在爬取:{}".format(detail_url))
                    infos = get_info(detail_url)
                    print(infos)
                    csvwriter.writerow(
                        [infos['city'], infos['title'], infos['cardtime'], infos['cardplace'], infos['kms'],
                         infos['displacement'],
                         infos['speedbox'],
                         infos['price']])
            if next_url:
                url = 'https://www.guazi.com' + next_url
            else:
                # Last page reached — the original looped forever here,
                # re-crawling the final page.
                break
    
    
    # Script entry point: run the crawler only when executed directly,
    # not when imported as a module.
    if __name__ == '__main__':
        main()

    后续将进行数据分析

  • 相关阅读:
    字符串的问题(strstr和strncpy 水题)
    数一数(KMP+思维)
    栗酱的数列(KMP+思维)
    D. Almost All Divisors(思维)
    E. Two Arrays and Sum of Functions(贪心)
    好位置(思维)
    Just A String(kmp)
    Dubbo简介
    lambda表达式快速创建
    分布式RPC系统框架Dubbo
  • 原文地址:https://www.cnblogs.com/Jery-9527/p/10799083.html
Copyright © 2011-2022 走看看