zoukankan      html  css  js  c++  java
  • 文本处理、词频统计与Simhash生成文档指纹

    from collections import Counter

    text='''The Data Mining and Web Information System Research Group DMWIS at College of Computer Science and Technology in Jilin University is a team of faculty, master and doctoral candidate students. Our research interests are AI, data mining, machine learning, web mining, text-based information retrieval, natural language processing, and applications of deep learning to natural language processing'''
    # Strip punctuation: map each listed mark to a space in a single pass.
    # '-' is not in the list, so "text-based" stays one token.
    text = text.translate(str.maketrans({mark: ' ' for mark in ',()?!.'}))
    # Case-insensitive tokenisation on whitespace.
    temp = text.lower().split()
    # Word -> number of occurrences, kept as a plain dict in first-seen order
    # so later cells can iterate it exactly as before.
    dic = dict(Counter(temp))
    for key, count in dic.items():
        print(key, count, end=' , ')
    
    the 1 , data 2 , mining 3 , and 4 , web 2 , information 2 , system 1 , research 2 , group 1 , dmwis 1 , at 1 , college 1 , of 3 , computer 1 , science 1 , technology 1 , in 1 , jilin 1 , university 1 , is 1 , a 1 , team 1 , faculty 1 , master 1 , doctoral 1 , candidate 1 , students 1 , our 1 , interests 1 , are 1 , ai 1 , machine 1 , learning 2 , text-based 1 , retrieval 1 , natural 2 , language 2 , processing 2 , applications 1 , deep 1 , to 1 , 
    
    print(len(dic))
    
    41
    
    # Generate 41 distinct random integers in the range 0-255: one 8-bit code
    # per distinct word (len(dic) is 41 for the sample text).
    import random
    # random.sample draws without replacement, so uniqueness is guaranteed
    # without a manual retry loop or a "seen" set.
    random_list = random.sample(range(256), 41)
    print(random_list)
    
    
    [87, 127, 40, 63, 225, 90, 107, 97, 71, 62, 91, 51, 2, 160, 188, 166, 20, 44, 95, 116, 119, 106, 241, 78, 8, 89, 167, 123, 184, 199, 16, 18, 114, 120, 183, 124, 232, 81, 82, 230, 211]
    

    按位加权累加(并非异或):随机码的某一位为 1 则加上该词的词频,为 0 则减去该词的词频

    # ans[k] accumulates a signed weight for the k-th bit of the 8-bit
    # fingerprint, with k = 0 corresponding to the most significant bit.
    ans = [0] * 8
    for slot, bit in enumerate(range(7, -1, -1)):
        mask = 1 << bit
        for idx, key in enumerate(dic):
            code = random_list[idx]  # the idx-th word uses the idx-th random code
            if bit == 7:
                # Show each word's code once, during the first pass only.
                print(key, bin(code))
            # A set bit votes +frequency, a clear bit votes -frequency.
            ans[slot] += dic[key] if mask & code else -dic[key]
    print(ans)
    
    the 0b1010111
    data 0b1111111
    mining 0b101000
    and 0b111111
    web 0b11100001
    information 0b1011010
    system 0b1101011
    research 0b1100001
    group 0b1000111
    dmwis 0b111110
    at 0b1011011
    college 0b110011
    of 0b10
    computer 0b10100000
    science 0b10111100
    technology 0b10100110
    in 0b10100
    jilin 0b101100
    university 0b1011111
    is 0b1110100
    a 0b1110111
    team 0b1101010
    faculty 0b11110001
    master 0b1001110
    doctoral 0b1000
    candidate 0b1011001
    students 0b10100111
    our 0b1111011
    interests 0b10111000
    are 0b11000111
    ai 0b10000
    machine 0b10010
    learning 0b1110010
    text-based 0b1111000
    retrieval 0b10110111
    natural 0b1111100
    language 0b11101000
    processing 0b1010001
    applications 0b1010010
    deep 0b11100110
    to 0b11010011
    [-28, 10, 16, 10, 0, -10, 8, -4]
    
    # Collapse each accumulated weight to one fingerprint bit: 1 if positive, else 0.
    [int(weight > 0) for weight in ans]
    
    [0, 1, 1, 1, 0, 0, 1, 0]
  • 相关阅读:
    初识ambari
    MySQL Split 函数
    行存储和列存储
    Hbase安装和错误
    mysql 常用自定义函数解析
    mysql 错误 Table ‘./mysql/proc’ is marked as crashed and should be repaired
    MySql提示:The server quit without updating PID file(…)失败
    mysql 自定义函数
    hive 调优总结
    [css] line boxes
  • 原文地址:https://www.cnblogs.com/Tony100K/p/12731214.html
Copyright © 2011-2022 走看看