zoukankan      html  css  js  c++  java
  • 爬取汽车之家车型配置信息

    一、需求

    获取指定品牌的所有车型配置信息,并保存到excel中。

    流程大致思路:

    1.获取品牌id:brand_id

    2.通过品牌id获取车型id:series_id

    3.获取车型配置页面

    4.解析配置页面内容(这步最复杂,使用了之前一些大神的代码)

    二、代码

    测试完美运行

    import requests
    import json
    import xlwt
    from bs4 import BeautifulSoup
    import re
    from urllib import parse
    from selenium import webdriver
    
    
    class Car_home_config(object):
        """Scrape per-series configuration tables from car.autohome.com.cn.

        Workflow (see ``check``): collect brand ids from the left-hand menu
        endpoint, collect the series ids of the requested brand(s), download
        every series' configuration page, undo the CSS ``::before`` text
        obfuscation with a headless browser, and write one ``.xls`` workbook
        per series.
        """

        # Left-hand menu endpoint that lists brands and, per brand, series.
        _MENU_URL = r"https://car.autohome.com.cn/AsLeftMenu/As_LeftListNew.ashx"
        # First run of digits in a menu link, e.g. "33" in ".../brand-33.html".
        _ID_RE = re.compile(r"\d+")
        # Obfuscation scripts embedded in a configuration page; each one is
        # an immediately-invoked function ending in "(document);".
        _JS_RE = re.compile(r'(\(function\([a-zA-Z]{2}.*?_\).*?\(document\);)')
        # Placeholder spans whose visible text is supplied by a CSS rule.
        _SPAN_RE = re.compile(r"<span class='([^']*)'></span>")
        # Layout of the string produced by ``car_info``. Anchoring each JSON
        # blob on the following "var ..." keeps a ';' inside a value (for
        # example "&nbsp;") from truncating the capture.
        _DATA_RE = re.compile(r"var config = (.*?);var option = (.*?);var bag = ")

        def __init__(self):
            self.session = requests.Session()
            self.params = None
            self.brand_dict = {}
            self.series_dict = {}
            self.brand_name = None

        @staticmethod
        def _menu_params(brand_id):
            """Return the menu query parameters for ``brand_id`` ("0" = all brands)."""
            return {
                "typeId": "1",
                "brandId": brand_id,
                "fctId": "0",
                "seriesId": "0"
            }

        @classmethod
        def _extract_id(cls, href):
            """Return the first run of digits in ``href``, or None if there is none."""
            if not href:
                return None
            found = cls._ID_RE.search(href)
            return found.group(0) if found else None

        def get_header(self):
            """Build ``self.headers`` for the menu endpoint from ``self.params``."""
            self.headers = {
                "authority": "car.autohome.com.cn",
                "method": "GET",
                "path": "/AsLeftMenu/As_LeftListNew.ashx?%s" % parse.urlencode(self.params),
                "scheme": "https",
                "accept": "*/*",
                "accept-encoding": "gzip, deflate, br",
                "accept-language": "zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7",
                "cache-control": "no-cache",
                "pragma": "no-cache",
                "sec-ch-ua": "Google Chrome;v=87,Not;A Brand;v=99,Chromium;v=87",
                "sec-ch-ua-mobile": "?0",
                "sec-fetch-dest": "document",
                "sec-fetch-mode": "navigate",
                "sec-fetch-site": "none",
                "sec-fetch-user": "?1",
                "upgrade-insecure-requests": "1",
                "user-agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36"
            }

        def _fetch_menu_soup(self):
            """Request the menu endpoint with the current params/headers and parse it."""
            res = self.session.get(url=self._MENU_URL, headers=self.headers, params=self.params)
            # Let requests guess the charset from the body before decoding.
            res.encoding = res.apparent_encoding
            return BeautifulSoup(res.text, 'lxml')

        # Collect the id of every brand.
        def get_brand_id(self):
            """Fill and return ``self.brand_dict`` as ``{brand_id: link text}``."""
            self.params = self._menu_params("0")
            self.get_header()
            soup = self._fetch_menu_soup()
            for ul in soup.find_all("ul"):
                for li in ul.find_all("li"):
                    link = li.find("a")
                    if link is None:
                        continue
                    brand_id = self._extract_id(link.attrs.get('href'))
                    if brand_id is not None:
                        self.brand_dict[brand_id] = link.text
            return self.brand_dict

        def get_AsLeftMenu(self):
            """Add the series listed for the current ``self.params`` to ``self.series_dict``.

            ``self.params`` and ``self.headers`` must already be set (see
            ``get_series_id``). Links without a numeric id are skipped.
            """
            soup = self._fetch_menu_soup()
            for dd in soup.find_all("dd"):
                for link in dd.find_all("a"):
                    series_id = self._extract_id(link.attrs.get('href'))
                    if series_id is not None:
                        self.series_dict[series_id] = link.text

        # Collect the series ids of one brand (or of every brand).
        def get_series_id(self):
            """Fill and return ``self.series_dict`` as ``{series_id: link text}``.

            When ``self.brand_name`` is set, only the first brand whose menu
            text contains it is queried; otherwise every brand is queried.
            The dict is empty when no brand matches.
            """
            self.get_brand_id()
            # Start clean so a second run does not re-scrape earlier results.
            self.series_dict = {}
            for brand_id, brand_text in self.brand_dict.items():
                if self.brand_name and self.brand_name not in brand_text:
                    continue
                self.params = self._menu_params(brand_id)
                self.get_header()
                self.get_AsLeftMenu()
                if self.brand_name:
                    # Only the first matching brand is wanted.
                    break
            return self.series_dict

        # Download the configuration page of one series.
        def get_config_content(self, series_id):
            """Return the configuration page of ``series_id`` decoded as UTF-8."""
            path = "/config/series/{}.html".format(series_id)
            headers = {
                "authority": "car.autohome.com.cn",
                "method": "GET",
                "path": path,
                "scheme": "https",
                "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
                "accept-encoding": "gzip, deflate, br",
                "accept-language": "zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7",
                "cache-control": "no-cache",
                "referer": "https://www.autohome.com.cn/",
                "sec-ch-ua": "Google Chrome;v=87,Not;A Brand;v=99,Chromium;v=87",
                "sec-ch-ua-mobile": "?0",
                "sec-fetch-dest": "document",
                "sec-fetch-mode": "navigate",
                "sec-fetch-site": "same-site",
                "sec-fetch-user": "?1",
                "upgrade-insecure-requests": "1",
                "user-agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36"
            }
            # TLS verification is left at its default (enabled), matching the
            # menu requests this class sends to the same host.
            res = self.session.get("https://car.autohome.com.cn" + path, headers=headers)
            return res.content.decode("utf-8")

        def car_info(self, html):
            """Extract the three embedded data assignments from a config page.

            Returns the concatenation of the ``var config``, ``var option``
            and ``var bag`` statements found in ``html``, or an empty string
            when any of the three is missing.
            """
            config = re.search("var config = (.*?)};", html)  # vehicle parameters
            option = re.search("var option = (.*?)};", html)  # equipment / options
            bag = re.search("var bag = (.*?)};", html)  # optional packages
            if config and option and bag:
                return config.group(0) + option.group(0) + bag.group(0)
            return ""

        @classmethod
        def _decode_spans(cls, car_info, rules_text):
            """Replace obfuscation spans in ``car_info`` with their real text.

            ``rules_text`` holds the CSS rules collected by the browser, each
            shaped like ``.<class>::before { content:"<text>" }``. A span
            whose class has no such rule is left unchanged.
            """
            resolved = {}

            def lookup(match):
                class_name = match.group(1)
                if class_name not in resolved:
                    rule = re.search(re.escape(class_name) + r"::before \{ content:(.*?)\}", rules_text)
                    word = re.search(r'"(.*?)"', rule.group(1)) if rule else None
                    resolved[class_name] = word.group(1) if word else match.group(0)
                return resolved[class_name]

            return cls._SPAN_RE.sub(lookup, car_info)

        def write_html(self, js_list, car_info):
            """De-obfuscate ``car_info`` by running the page's scripts in PhantomJS.

            The scripts in ``js_list`` are run against a stub DOM that records
            every inserted CSS rule; the recorded rules are then used to
            replace the placeholder spans in ``car_info``.

            Returns the decoded string, or None when the browser could not be
            started or produced no rules. Writes ``original.html`` to the
            current directory (overwriting any previous copy).
            """
            # Stub DOM: just enough of `document`/`window` for the scripts to
            # run, with insertRule() appending each rule to `rules`.
            DOM = ("var rules = '2';"
                   "var document = {};"
                   "function getRules(){return rules}"
                   "document.createElement = function() {"
                   "      return {"
                   "              sheet: {"
                   "                      insertRule: function(rule, i) {"
                   "                              if (rules.length == 0) {"
                   "                                      rules = rule;"
                   "                              } else {"
                   "                                      rules = rules + '#' + rule;"
                   "                              }"
                   "                      }"
                   "              }"
                   "      }"
                   "};"
                   "document.querySelectorAll = function() {"
                   "      return {};"           "};"
                   "document.head = {};"
                   "document.head.appendChild = function() {};"

                   "var window = {};"
                   "window.decodeURIComponent = decodeURIComponent;")

            html_type = "<html><meta http-equiv='Content-Type' content='text/html; charset=utf-8' /><head></head><body>    <script type='text/javascript'>"
            # Assemble a runnable page: stub DOM, the page's scripts, then a
            # statement that prints the collected rules into <body>.
            page = html_type + DOM + "".join(js_list) + " document.write(rules)</script></body></html>"
            with open("original.html", "w", encoding="utf-8") as f:
                f.write(page)

            driver = None
            text = ""
            try:
                driver = webdriver.PhantomJS(
                    executable_path=r"phantomjs.exe")
                driver.get("original.html")
                # The body text is the '#'-joined list of CSS rules.
                text = driver.find_element_by_tag_name('body').text
            except Exception as e:
                # Best effort: report the failure and skip this series.
                print(e)
            finally:
                # The driver may never have been created; quit() (not close())
                # also terminates the PhantomJS process.
                if driver is not None:
                    driver.quit()
            if not text:
                return None
            return self._decode_spans(car_info, text)

        @classmethod
        def _parse_tables(cls, car_info):
            """Turn a decoded ``car_info`` string into ``{item name: [values]}``.

            Items come from ``config.result.paramtypeitems[*].paramitems`` and
            ``option.result.configtypeitems[*].configitems``; each list holds
            one value per model, in page order. The ``bag`` data is not used.

            Raises:
                ValueError: if ``car_info`` does not have the expected layout.
            """
            found = cls._DATA_RE.search(car_info)
            if found is None:
                raise ValueError("car_info does not contain config/option/bag data")
            config_re = json.loads(found.group(1))
            option_re = json.loads(found.group(2))

            items = []
            for group in config_re['result']['paramtypeitems']:
                items += group['paramitems']
            for group in option_re['result']['configtypeitems']:
                items += group['configitems']

            car_item = {}
            for item in items:
                car_item[item['name']] = [value['value'] for value in item['valueitems']]
            return car_item

        def save(self, car_info, car_name, save_path):
            """Write the configuration table to ``<save_path>/<car_name>.xls``.

            Row 0 holds the item names; each following row holds one model.
            Characters that are not allowed in file names are replaced with
            ``_`` in ``car_name``.
            """
            car_item = self._parse_tables(car_info)

            workbook = xlwt.Workbook(encoding='utf-8')
            worksheet = workbook.add_sheet('汽车之家')
            for col_num, name in enumerate(car_item):
                worksheet.write(0, col_num, name)

            # One row per model; fall back to the longest column if the
            # model-name item is absent.
            if '车型名称' in car_item:
                row_count = len(car_item['车型名称'])
            else:
                row_count = max((len(values) for values in car_item.values()), default=0)
            for row in range(row_count):
                for col_num, values in enumerate(car_item.values()):
                    # Items with fewer values than models get empty cells.
                    con = str(values[row]) if row < len(values) else ""
                    worksheet.write(row + 1, col_num, con)

            safe_name = re.sub(r'[\\/:*?"<>|]', "_", car_name)
            workbook.save('{}/{}.xls'.format(save_path, safe_name))

        # Scrape the configuration of one brand; with no brand_name, all brands.
        def check(self, brand_name=None, save_path="./"):
            """Scrape every series of ``brand_name`` and save one workbook each.

            Args:
                brand_name: text to look for in the brand menu; a falsy value
                    means every brand.
                save_path: directory the ``.xls`` files are written to.
            """
            self.brand_name = brand_name
            self.get_series_id()
            for series_id, car_name in self.series_dict.items():
                print(series_id, car_name)
                html = self.get_config_content(series_id)
                car_info = self.car_info(html)
                if not car_info:
                    # No configuration data on this page; skip the browser run.
                    continue
                js_list = self._JS_RE.findall(html)
                car_info = self.write_html(js_list, car_info)
                if car_info:
                    self.save(car_info, car_name, save_path)
    
    
    # Script entry: scrape every series of the Audi brand ("奥迪") into ./
    car = Car_home_config()
    car.check(
        "奥迪",
    )

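    phantomjs.exe下载地址:https://phantomjs.org/download.html(下载后将 phantomjs.exe 放在脚本运行目录下。注意:Selenium 4 已移除 webdriver.PhantomJS 和 find_element_by_tag_name,运行本代码需要安装 Selenium 3.x。)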

    感谢以下作者:
    https://www.cnblogs.com/kangz/p/10011348.html
    https://www.cnblogs.com/pontoon/p/10459471.html

  • 相关阅读:
    Javascript 入门 document
    JavaScript 入门 (一)
    Python 之 Json序列化嵌套类
    [20171113]修改表结构删除列相关问题3.txt
    [20171113]修改表结构删除列相关问题2.txt
    [20171113]修改表结构删除列相关问题.txt
    [20171110]_allow_read_only_corruption参数.txt
    [20171107]dbms_shared_pool.pin补充.txt
    [20171107]dbms_shared_pool.pin.txt
    [20171106]修改show spparameter的显示宽度.txt
  • 原文地址:https://www.cnblogs.com/angelyan/p/14306705.html
Copyright © 2011-2022 走看看