zoukankan      html  css  js  c++  java
  • 阶段作业1:完整的中英文词频统计+补交上次作业

    # Make-up for the previous assignment: word frequency of English song lyrics

    cc = ('''Counting stars Lately I've been, I've been losing sleep   
    Dreaming 'bout the things that we could be   
    But baby I've been, I've been prayin' hard     
    Said no more counting dollars   We'll be counting stars   
    Yeah, we'll be counting stars   I see this life Like a swinging vine  
     Swing my heart across the line   In my face is flashing signs   Seek it out and ye shall find
      Old, but I'm not that old   Young, but I'm not that bold   And I don't think the world is sold  
     I'm just doing what we're told   I, feel something so right   But doing the wrong thing   
    I, feel something so wrong   But doing the right thing   I could lie, could lie, could lie  
     everything that kills me makes me feel alive   Lately I've been, I've been losing sleep  
     Dreaming 'bout the things that we could be   Baby I've been, I've been prayin' hard  
     Said no more counting dollars   We'll be counting stars   Lately I've been, I've been losing sleep   
    Dreaming 'bout the things that we could be   Baby I've been, I've been prayin' hard   Said no more counting dollars  
     We'll be, we'll be counting stars   I feel the love And I feel it burn   Down this river every turn  
     Hope is a four letter word   Make that money   Watch it burn   Old, but I'm not that old  
     Young, but I'm not that bold   And I don't think the world is sold   I'm just doing what we're told  
     I, feel something so wrong   But doing the right thing   I could lie, could lie, could lie  
     Everything that drowns me makes me wanna fly   Lately I've been, I've been losing sleep  
     Dreaming 'bout the things that we could be   Baby I've been, I've been prayin' hard
      Said no more counting dollars   We'll be counting stars   Lately I've been, I've been losing sleep  
     Dreaming 'bout the things that we could be   Baby I've been, I've been prayin' hard  
     Said no more counting dollars   We'll be, we'll be counting stars   Take that money And watch it burn   Sink in the river
    ''')
    cc = cc.replace('.', ' ')
    ccList = cc.split()
    # Split into words and report how many there are.  len(ccList) is the word
    # count; len(cc) would be the number of characters in the text.
    print(len(ccList), ccList)
    ccSet = set(ccList)  # the distinct words

    print(ccSet)


    # Count whole tokens in one pass over the word list.  Calling str.count()
    # on the raw text matches substrings (e.g. 'be' inside 'been') and so
    # inflates the totals.
    strDict = {}
    for star in ccList:
        strDict[star] = strDict.get(star, 0) + 1
    for key in strDict:
        print(key, strDict[key])

    # Sort by frequency, most frequent first.  The (word, count) pairs come
    # from the dict: a set has no .items(), so building them from ccSet raises
    # AttributeError.
    wcList = list(strDict.items())
    print(wcList)
    wcList.sort(key=lambda x: x[1], reverse=True)
    print(wcList)

    # Print the top 20; slicing copes with fewer than 20 distinct words.
    for item in wcList[:20]:
        print(item)
    
    
    # Traversing a list: build it, append one element, drop the element at
    # index 2, then print every remaining element on its own line.

    cclist = ['wqdq', 'dqd', 'Awd', 313, '小四', 'dqd']
    print(cclist)
    cclist += ['gegeheh']  # in-place extend with a single new element
    print(cclist)
    del cclist[2]  # removes 'Awd'
    print(cclist)
    for element in cclist:
        print(element)
    
    # Traversing a tuple: index one element, then print every element.

    # Named sample_tuple rather than `tuple` so the builtin type is not
    # shadowed for the rest of the module.
    sample_tuple = ('jtfjhrr', 'rqfw f2q', 800, 10)
    print(sample_tuple[2])
    for i in sample_tuple:
        print(i)
    
    # Traversing a dict: read two entries, overwrite one of them twice, show
    # the final value, then print every key/value pair.

    dic = {'fhehe': '4w6436', 'jgdns': 7, '4w6436': 'First'}

    for looked_up in ('fhehe', '4w6436'):
        print(looked_up + ':', dic[looked_up])

    # The second assignment replaces the first; only the string survives.
    dic['4w6436'] = 8
    dic['4w6436'] = "对接欧文机房的维护"

    for _ in range(2):
        print('4w6436:', dic['4w6436'])

    for key, value in dic.items():
        print(key, ':', value)
    
    # Traversing a set: create it, add two members, remove one, and print the
    # set after each change before listing the members one per line.

    a = {1, 2, 3, 6, 5}
    print(a)

    for new_member in (4, 'uteru'):
        a.add(new_member)
        print(a)

    a.remove(5)
    print(a)

    for member in a:
        print(member)
    

      

    # Current assignment: word frequency of an English text file

    # The context manager closes the file even if reading raises.
    with open('ccc1015.txt', 'r', encoding='utf-8') as fo:
        strBig = fo.read().lower()
    print(strBig)
    # Pre-processing: the text is already lower-cased; now turn punctuation
    # into spaces.  Deleting the characters outright would fuse words that are
    # separated only by punctuation ("a,b" -> "ab").
    sep = """.,:;!?"""
    strBig = strBig.translate(str.maketrans(sep, ' ' * len(sep)))
    strlist = strBig.split()
    print(len(strlist), strlist)
    # Stop words must be lower case because the text was lower-cased above
    # (an upper-case 'I' can never match).
    exclude = {'is', 'be', 'i', 'we', 'the', 'in'}
    strSet = set(strlist) - exclude
    print(len(strSet), strSet)
    # Count in a single pass instead of calling strlist.count() once per
    # distinct word.
    strDict = {}
    for word in strlist:
        if word not in exclude:
            strDict[word] = strDict.get(word, 0) + 1
    print(len(strDict), strDict)
    # Sort by frequency, most frequent first
    wcList = list(strDict.items())
    print(wcList)
    wcList.sort(key=lambda x: x[1], reverse=True)
    print(wcList)

    # Print the top 20; slicing copes with fewer than 20 distinct words.
    for item in wcList[:20]:
        print(item)
    
    
    
    
    # Chinese version


    # jieba (third-party) performs the Chinese word segmentation below.
    import jieba

    # Read the text file; the context manager closes it even if reading raises.
    with open('shengxu.txt', 'r', encoding='utf-8') as f:
        story = f.read()
    print(story)

    # Pre-processing: replace each punctuation mark with a space
    sep = ',。:“”?!'
    for ch in sep:
        story = story.replace(ch, ' ')
    # Show the cleaned text once, after every mark has been replaced.
    print(story)

    # Chinese word segmentation with jieba.cut in its default mode
    cnStr = story
    # Segment once and drop whitespace-only tokens: the spaces substituted for
    # punctuation (and the line breaks) would otherwise be counted as words.
    strList = [word for word in jieba.cut(cnStr) if word.strip()]
    print(len(strList), strList)
    # Distinct words
    strSet = set(strList)
    print(len(strSet), strSet)
    # Word-count dictionary, built in a single pass over the token list
    strDict = {}
    for word in strList:
        strDict[word] = strDict.get(word, 0) + 1
    # Sort by frequency, most frequent first
    wcList = list(strDict.items())
    wcList.sort(key=lambda x: x[1], reverse=True)

    # Print the top 10; slicing copes with fewer than 10 distinct words.
    for item in wcList[:10]:
        print(item)
    

      

      

     

  • 相关阅读:
    04构建之法阅读笔记之四
    冲刺周期第十天
    冲刺周期第九天
    第二阶段冲刺--每日立会(4)
    第二阶段冲刺--每日立会(3)
    第二阶段冲刺--每日立会(2)
    第二阶段冲刺--每日立会(1)
    第十二周学习记录表
    第一阶段冲刺--每日立会(10)
    第一阶段冲刺--每日立会(9)
  • 原文地址:https://www.cnblogs.com/cc013/p/9789856.html
Copyright © 2011-2022 走看看