zoukankan html css js c++ java

python 编码问题

__author__ = 'dell'
# -*- coding: utf-8 -*-

from lxml import etree
import urllib2
import time


def loadCategory():
    """Load the category table from ``catetory.txt``.

    The file is read as GBK text, one record per line, with fields
    separated by a tab: the first field is the category id and the second
    is the category name.  Lines with fewer than two fields are skipped and
    any fields after the second are ignored.  Every category name is
    printed as it is read.

    Returns:
        dict mapping category name to category id (both decoded text).
        If a name appears on several lines, the last one wins.
    """
    res = {}
    # Read raw bytes and decode explicitly so the content is not touched by
    # platform text-mode handling; ``with`` guarantees the handle is closed
    # even when a line fails to decode.
    with open('catetory.txt', 'rb') as f_txt:
        for raw in f_txt:
            tokens = raw.strip().decode('gbk').split('\t')
            if len(tokens) < 2:
                continue
            key = tokens[1].strip()
            print(key)
            res[key] = tokens[0].strip()
    return res


def loadCity():
    """Load the city table from ``city.txt``.

    The file is read as GBK text, one record per line, in the form
    ``<name>:<id>``.  Lines without a colon are skipped; anything after a
    second colon is ignored.  When a city name appears more than once the
    first entry is kept and a ``repeated city:`` notice is printed.

    Returns:
        dict mapping city name to city id (both decoded text).
    """
    res = {}
    # Read raw bytes and decode explicitly; ``with`` guarantees the handle
    # is closed even when a line fails to decode.
    with open('city.txt', 'rb') as f_txt:
        for raw in f_txt:
            tokens = raw.strip().decode('gbk').split(':')
            if len(tokens) < 2:
                continue
            key = tokens[0].strip()
            # Direct dict membership test; no intermediate key list.
            if key in res:
                print('%s %s' % ('repeated city:', key))
            else:
                res[key] = tokens[1].strip()
    return res


# Category table: category name -> category id, loaded once at start-up.
cats = loadCategory()
# Debug aid: uncomment to dump the category table.
# for key in cats.keys():
#     print key, cats[key]

# City table: city name -> city id, loaded once at start-up.
citys = loadCity()
# Debug aid: uncomment to dump the city table.
# for key in citys.keys():
#     print key, citys[key]

print 'length of category:', len(cats)
print 'length of citys:', len(citys)

print 'generating urls ... ...'

# Search URL template; gen() fills it as (city id, category id).
standard = 'http://www.dianping.com/search/category/%s/%s'


def gen(cateName):
    """Build the search URLs of one category for every known city.

    Args:
        cateName: category name, looked up in the module-level ``cats``.

    Returns:
        A list of ``(url, cityName)`` tuples, one per entry of the
        module-level ``citys`` table, where ``url`` is ``standard`` filled
        with the city id and the category id.  The list is empty when
        ``cateName`` is not a known category.
    """
    if cateName not in cats:
        return []
    catId = cats[cateName]
    return [(standard % (cityId, catId), cityName)
            for cityName, cityId in citys.items()]


def getHtml(url):
    """Fetch ``url`` and return the response body decoded as UTF-8.

    The request is sent with a Firefox ``User-Agent`` header and a
    45-second timeout.

    Args:
        url: address to fetch.

    Returns:
        The response body as decoded text.

    Raises:
        urllib2.URLError: if the request fails.
        UnicodeDecodeError: if the body is not valid UTF-8.
    """
    request = urllib2.Request(url)
    request.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; rv:19.0) Gecko/20100101 Firefox/19.0')
    response = urllib2.urlopen(request, timeout=45)
    # Close the connection explicitly: this function is called once per URL
    # in a long loop, and an unclosed response keeps its socket open.
    try:
        return response.read().decode('utf8')
    finally:
        response.close()


def getFetchHour(count):
    """Return ``count * 5.0`` seconds expressed in hours.

    NOTE(review): the factor 5 presumably stands for a 5-second delay per
    fetched item -- confirm against the crawl loop.
    """
    seconds = count * 5.0
    return seconds / 3600


def getFetchDay(count):
    """Return ``count * 5.0`` seconds expressed in days.

    NOTE(review): the factor 5 presumably stands for a 5-second delay per
    fetched item -- confirm against the crawl loop.
    """
    hours = count * 5.0 / 3600
    return hours / 24


# Fetch the search page of one category for every city, add up the counts
# shown in the ``Color7`` spans, and report the total together with the
# fetch-time estimates derived from it.
urllist = gen(u'购物')
print(len(urllist))
# Named ``total`` rather than ``sum`` so the builtin is not shadowed.
total = 0
for url, cityName in urllist:
    tree = etree.HTML(getHtml(url))
    for hn in tree.xpath("//span[@class='Color7']"):
        # The span text is a count wrapped in parentheses.  ``text`` is None
        # when the span has no leading text, so fall back to ''.
        strnum = (hn.text or '').replace('(', '').replace(')', '')
        try:
            count = int(strnum)
        except ValueError:
            # One malformed span should not abort the whole run.
            print('%s %s' % (cityName, 'skipped non-numeric count: %r' % strnum))
            continue
        print('%s %s' % (cityName, strnum))
        total += count
    # time.sleep(5)

print(total)
print('fetch time (hour) :' + str(getFetchHour(total)))
print('fetch time (day) :' + str(getFetchDay(total)))

查看全文

相关阅读:
实习第一天
 使用epublib解析epub文件（章节内容、书籍菜单）
jdk1.8以前不建议使用其自带的Base64来加解密
 java学习-AES加解密之AES-128-CBC算法
 java学习-sha1散列算法
 日、周、月活跃用户数，用户流失率
 java学习-java.lang.Math随机数生成
 AndroidStudio报错Software caused connection abort: recv failed
java学习-java.lang一Number类
 jdk内置类javax.imageio.ImageIO支持的图片处理格式

原文地址：https://www.cnblogs.com/i80386/p/3421468.html