zoukankan      html  css  js  c++  java
  • spider autohome (1)

    Code:

    #!/usr/bin/python
    # -*- coding: UTF-8 -*-
    
    import re
    import urllib
    import time
    def getHtml(url):
        """Download a GB2312-encoded page and return it re-encoded as UTF-8.

        Args:
            url: Address of the page to fetch.

        Returns:
            The page body as a UTF-8 encoded byte string.

        Raises:
            UnicodeDecodeError: If the body is not valid GB2312.
        """
        page = urllib.urlopen(url)
        try:
            raw = page.read()
        finally:
            # Release the connection even when the read fails.
            page.close()
        return raw.decode('gb2312').encode('utf-8')
    
    def getInfo(html):
        """Extract the body of every ``config = {...};`` assignment in *html*.

        Each returned string is the text found between the braces of one
        assignment; the braces themselves are not included.  Matching is
        non-greedy and does not cross line breaks.

        Returns:
            A list of the captured strings, empty when nothing matches.
        """
        pattern = re.compile(r'config = {(.+?)};')
        return pattern.findall(html)
    
    def _as_text(value):
        """Return *value* as a native ``str``; ``None`` becomes ``''``."""
        if value is None:
            return ''
        if isinstance(value, str):
            return value
        try:
            # Python 2: json yields ``unicode``; convert back to the UTF-8
            # byte strings the rest of the script works with.
            return value.encode('utf-8')
        except AttributeError:
            return str(value)

    def getEachCar(config_lists):
        """Group the ``{"specid": ..., "value": ...}`` records by car.

        Every string in *config_lists* is scanned for records of the form
        ``{"specid":<digits>,"value":<json value>}``.  The values that share
        a specid are collected in the order they appear.

        Args:
            config_lists: Iterable of config bodies, as returned by getInfo.

        Returns:
            A list with one string per specid, in first-seen order.  Each
            string is the car's values, every one preceded by ``'|'``
            (for example ``'|v1|v2|v3'``).  A ``null`` value yields an
            empty field so the columns of different cars stay aligned.

        Raises:
            ValueError: If a matched record is not valid JSON.
        """
        # Function-scope import keeps this block self-contained.
        import json

        # ``\d+`` accepts spec ids of any length.
        record_re = re.compile(r'\{"specid":\d+,"value":.+?\}')
        values_by_spec = {}
        spec_order = []
        for sp in config_lists:
            for fragment in record_re.findall('{' + sp + '}'):
                # NOTE: the text comes from a downloaded page, so it is
                # parsed as JSON instead of being passed to eval().
                record = json.loads(fragment)
                spec_id = record['specid']
                if spec_id not in values_by_spec:
                    values_by_spec[spec_id] = []
                    spec_order.append(spec_id)
                values_by_spec[spec_id].append(_as_text(record['value']))
        return ['|' + '|'.join(values_by_spec[spec_id])
                for spec_id in spec_order]
    if __name__ == '__main__':
        # Fetch one spec page, pull out its embedded config data and print
        # one line per car.
        # Another sample page:
        #   http://car.autohome.com.cn/config/spec/21308.html#pvareaid=100679
        html = getHtml("http://car.autohome.com.cn/config/spec/18239.html")
        config_lists = getInfo(html)
        for car_line in getEachCar(config_lists):
            # Single-argument print(...) behaves the same on Python 2 and 3.
            print(car_line)

    Result:

    Can we drop this masquerade
  • 相关阅读:
    Django模型层进阶
    Django模型层相关
    BOM与DOM
    JavaScript入门
    HTML基础
    子查询/联合分组/all与any/视图/事务
    表的查询
    mysql的多表关系
    Mysql基本操作
    Spider_基础总结2_Requests异常
  • 原文地址:https://www.cnblogs.com/landpack/p/4555554.html
Copyright © 2011-2022 走看看