zoukankan      html  css  js  c++  java
  • 【python】字符串编码问题

    参考:http://blog.csdn.net/tingsking18/article/details/4033645

    python2内部处理文本时使用unicode对象;str对象则是按某种编码(例如gbk,utf-8)编码后的字节串

    decode函数用来将其他编码解码为unicode

    encode函数将unicode编码为指定的编码类型,例如gbk,utf-8

    # -*- coding: utf-8 -*-
    """
    Created on Wed Jan 15 15:20:59 2014
    
    @author: hp
    """
    
    
    import urllib2
    import re
    import time
    import jieba
    
    
    # Address of the page that is passed to geturl() by the script code below.
    url="http://blog.sina.com.cn/s/blog_608e1afd0102e5ym.html"
    def geturl(url):
        """Download a web page and return all of its Chinese text as one string.

        The page body is decoded as UTF-8 and every run of characters in the
        range U+4E00..U+9FA5 is extracted; the runs are concatenated in
        document order with nothing between them.

        :param url: address of the page to fetch.
        :returns: unicode string made only of the matched Chinese characters
            (empty when the page contains none).
        :raises UnicodeDecodeError: if the page body is not valid UTF-8.
        """
        response = urllib2.urlopen(url)
        try:
            raw = response.read()
        finally:
            # Release the connection even when the read fails.
            response.close()
        html = raw.decode('utf-8')
        # The \u escapes are essential: without the backslashes the character
        # class matches ASCII characters from "0" to "u" instead of Chinese text.
        runs = re.findall(u"[\u4e00-\u9fa5]+", html)
        return u"".join(runs)  # web content, Chinese characters only
    def separate_word(s):
        """Segment text into a list of words using jieba.

        :param s: text to segment.
        :returns: list of the tokens yielded by ``jieba.cut(s, cut_all=False)``,
            in their original order.
        """
        # Use the tokens directly.  Joining them with "/ " and then re-splitting
        # character by character on "/" dropped the final word, left a leading
        # space on every word after the first, and broke any token that was
        # itself "/".
        return list(jieba.cut(s, cut_all=False))
        
    def count_word(word_list):
        """Count how many times each word occurs in ``word_list``.

        :param word_list: iterable of words (text strings).
        :returns: dict mapping each distinct word, encoded as GBK bytes, to the
            number of times it occurs.
        :raises UnicodeEncodeError: if a word contains a character that GBK
            cannot represent.
        """
        dic = {}
        for w in word_list:
            # Keys are GBK-encoded bytes, the same key type this function has
            # always returned.
            key = w.encode('gbk')
            # One pass with a dict lookup instead of comparing every word
            # against every other word.
            dic[key] = dic.get(key, 0) + 1
        return dic
        
    contant=geturl(url)
    word=separate_word(contant)
    result=count_word(word)
    for key in result.keys():
        print key.encode('gbk'),"--->",result[key]
    #print result
    
    time.sleep(10)
  • 相关阅读:
    初识Kafka
    初学Linux (Linux_note)
    Zookeeper_ACL
    Zookeeper简单配置
    getWindow().setFlags
    设置session失效时间
    Android开发者必知的5个开源库
    oracle A用户访问B用户的表aa
    oracle 根据约束名查表名
    java ftp上载下传 遇到的问题
  • 原文地址:https://www.cnblogs.com/colipso/p/3522821.html
Copyright © 2011-2022 走看看