zoukankan      html  css  js  c++  java
  • python 编码问题

    __author__ = 'dell'
    # -*- coding: utf-8 -*-
    
    from lxml import etree
    import urllib2
    import time
    
    
    def loadCategory():
        """Load the category lookup table from ``catetory.txt``.

        The file is read from the current working directory. Each usable line
        holds at least two tab-separated, GBK-encoded fields: the first field
        becomes the value and the second field becomes the key. Lines with
        fewer than two fields (including blank lines) are skipped. Every key
        is printed as it is read.

        Returns:
            dict mapping the second field of each line to its first field,
            both decoded from GBK and stripped of surrounding whitespace.
            When a key occurs more than once, the last line wins.
        """
        res = {}
        # Binary mode: the raw bytes are decoded explicitly below. The
        # ``with`` block closes the handle even if decoding raises.
        with open('catetory.txt', 'rb') as f_txt:
            for line in f_txt:
                tokens = line.strip().decode('gbk').split('\t')
                if len(tokens) < 2:
                    continue
                key = tokens[1].strip()
                # Single-argument form prints the same text as ``print key``.
                print(key)
                res[key] = tokens[0].strip()
        return res
    
    
    def loadCity():
        """Load the city lookup table from ``city.txt``.

        The file is read from the current working directory. Each usable line
        holds at least two ``:``-separated, GBK-encoded fields: the first
        field is the key and the second field is the value. Lines with fewer
        than two fields (including blank lines) are skipped.

        Returns:
            dict mapping the first field of each line to its second field,
            both decoded from GBK and stripped of surrounding whitespace.
            When a key occurs more than once, the first line wins and a
            ``repeated city:`` notice is printed for each later occurrence.
        """
        res = {}
        # Binary mode: the raw bytes are decoded explicitly below. The
        # ``with`` block closes the handle even if decoding raises.
        with open('city.txt', 'rb') as f_txt:
            for line in f_txt:
                tokens = line.strip().decode('gbk').split(':')
                if len(tokens) < 2:
                    continue
                key = tokens[0].strip()
                # Test membership on the dict itself; ``res.keys()`` builds a
                # fresh list for every line on Python 2.
                if key in res:
                    print('repeated city: %s' % key)
                else:
                    res[key] = tokens[1].strip()
        return res
    
    
    # Both lookup tables are built once, when the script starts; gen() reads
    # them as module-level globals. (Debug aid: iterate over cats / citys and
    # print each key with its value to inspect the loaded tables.)
    cats = loadCategory()
    citys = loadCity()

    print('%s %d' % ('length of category:', len(cats)))
    print('%s %d' % ('length of citys:', len(citys)))

    print('generating urls ... ...')

    # URL template; gen() fills it with (city id, category id), in that order.
    standard = 'http://www.dianping.com/search/category/%s/%s'
    
    
    def gen(cateName):
        """Build the search URLs of one category for every loaded city.

        Args:
            cateName: category name, looked up in the module-level ``cats``
                table.

        Returns:
            A list of ``(url, cityName)`` tuples, one per entry of the
            module-level ``citys`` table. The list is empty when ``cateName``
            is not a key of ``cats``.
        """
        # Membership test on the dict itself; ``cats.keys()`` would build a
        # list and scan it linearly on Python 2.
        if cateName not in cats:
            return []
        catId = cats[cateName]
        # ``standard`` takes the city id first, then the category id.
        return [(standard % (cityId, catId), cityName)
                for cityName, cityId in citys.items()]
    
    
    def getHtml(url):
        """Download ``url`` and return its body decoded as UTF-8.

        The request is sent with a desktop Firefox ``User-Agent`` header and
        a 45 second timeout.

        Args:
            url: address to fetch.

        Returns:
            The response body as a unicode string.

        Raises:
            Whatever ``urllib2.urlopen``, ``read`` or ``decode`` raise; no
            exception is handled here.
        """
        request = urllib2.Request(url)
        request.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; rv:19.0) Gecko/20100101 Firefox/19.0')
        response = urllib2.urlopen(request, timeout=45)
        # Close the connection explicitly, even when read/decode fails;
        # urllib2 responses are not context managers on Python 2.
        try:
            return response.read().decode('utf8')
        finally:
            response.close()
    
    
    def getFetchHour(count, secondsPerFetch=5.0):
        """Return how many hours ``count`` items take at a fixed cost each.

        Args:
            count: number of items.
            secondsPerFetch: seconds spent per item. Defaults to 5.0, the
                value that was previously hard-coded.

        Returns:
            ``count * secondsPerFetch / 3600`` as a float.
        """
        # float() keeps true division on Python 2 even for integer arguments.
        return count * float(secondsPerFetch) / 3600
    
    
    def getFetchDay(count, secondsPerFetch=5.0):
        """Return how many days ``count`` items take at a fixed cost each.

        Args:
            count: number of items.
            secondsPerFetch: seconds spent per item. Defaults to 5.0, the
                value that was previously hard-coded.

        Returns:
            ``count * secondsPerFetch / 3600 / 24`` as a float.
        """
        # float() keeps true division on Python 2 even for integer arguments.
        hours = count * float(secondsPerFetch) / 3600
        return hours / 24
    
    
    # Crawl the first result page of one category in every city and add up
    # the counts shown on those pages.
    urllist = gen(u'购物')
    print(len(urllist))
    # Named ``total`` so the builtin ``sum`` is not shadowed at module level.
    total = 0
    for url, cityName in urllist:
        tree = etree.HTML(getHtml(url))
        for hn in tree.xpath("//span[@class='Color7']"):
            # Drop the parentheses around the span text before converting it
            # (presumably the text looks like "(123)" -- confirm on a page).
            strnum = hn.text.replace('(', '').replace(')', '')
            print(u'%s %s' % (cityName, strnum))
            total += int(strnum)
        # time.sleep(5)  # delay between requests, currently disabled

    print(total)
    print('fetch time (hour) :' + str(getFetchHour(total)))
    print('fetch time (day) :' + str(getFetchDay(total)))
  • 相关阅读:
    TableEx 控件 v1.0 [原创][免费][开源]
    js刷新页面
    SimpleAjax 开发包 v3.1 (简单的Ajax)
    oracle中的''空字符串和null居然是等价的
    HTTP 错误大全
    Ext2.0 form使用实例
    isqlweb (Web版 SQL Server 管理器)
    关于软件版本
    我的第一个C++程序——方块游戏 v1.0
    轻松实现UltraWebGrid中的分页控制
  • 原文地址:https://www.cnblogs.com/i80386/p/3421468.html
Copyright © 2011-2022 走看看